# read in docs
import os
from glob import glob
import numpy as np
import pandas as pd
from textparser import TextParser
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer
from langmod import NgramCounter
from langmod import NgramLanguageModel
import itertools
import seaborn as sns
import plotly.express as px
from numpy.linalg import norm
from scipy.spatial.distance import pdist
import scipy.cluster.hierarchy as sch
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from bow_tfidf_pca import create_bow, get_tfidf, get_pca
from prince import PCA
sns.set()
OHCO = ["book_id", "chap_id", "para_num", "sent_num", "token_num"]
SENTS = OHCO[:4]
PARAS = OHCO[:3]
CHAPS = OHCO[:2]
BOOKS = OHCO[:1]
LIB = pd.read_csv("twain_pre_LIB.csv").set_index(BOOKS).sort_index()
CORPUS = pd.read_csv("twain_pre_CORPUS.csv").set_index(OHCO)
# remove NaN values
CORPUS = CORPUS[~CORPUS.term_str.isna()]
VOCAB = pd.read_csv("twain_pre_VOCAB.csv")
VOCAB['term_str'] = VOCAB['term_str'].astype('str')
VOCAB = VOCAB.set_index('term_str')
VOCAB
| n | n_chars | p | i | max_pos | n_pos | cat_pos | stop | stem_porter | stem_snowball | stem_lancaster | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| term_str | |||||||||||
| 0 | 5 | 1 | 1.683290e-06 | 19.180285 | CD | 1 | {'CD'} | 0 | 0 | 0 | 0 |
| 00 | 3 | 2 | 1.009974e-06 | 19.917251 | NN | 2 | {'NN', 'NNS'} | 0 | 00 | 00 | 00 |
| 01 | 3 | 2 | 1.009974e-06 | 19.917251 | NNS | 2 | {'NN', 'NNS'} | 0 | 01 | 01 | 01 |
| 02 | 4 | 2 | 1.346632e-06 | 19.502213 | NN | 3 | {'POS', 'NN', 'NNP'} | 0 | 02 | 02 | 02 |
| 03 | 6 | 2 | 2.019948e-06 | 18.917251 | NN | 3 | {'POS', 'NN', 'NNS'} | 0 | 03 | 03 | 03 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| êtes | 1 | 4 | 3.366579e-07 | 21.502213 | NNS | 1 | {'NNS'} | 0 | ête | êtes | ête |
| être | 3 | 4 | 1.009974e-06 | 19.917251 | NNP | 2 | {'JJ', 'NNP'} | 0 | être | être | être |
| öffnen | 1 | 6 | 3.366579e-07 | 21.502213 | NN | 1 | {'NN'} | 0 | öffnen | öffnen | öffnen |
| über | 1 | 4 | 3.366579e-07 | 21.502213 | NNP | 1 | {'NNP'} | 0 | über | über | über |
| übergeschlagen | 1 | 14 | 3.366579e-07 | 21.502213 | NN | 1 | {'NN'} | 0 | übergeschlagen | übergeschlagen | übergeschl |
53854 rows × 11 columns
LIB
| source_file_path | title | chap_regex | author | type | year | decade | n_chaps | book_len | |
|---|---|---|---|---|---|---|---|---|---|
| book_id | |||||||||
| 70 | Twain/70-what_is_man.txt | what is man | WHAT IS MAN?|THE DEATH OF JEAN|THE TURNING-POI... | twain | non-fiction | 1906 | 1900 | 17 | 96111 |
| 74 | Twain/74-the_adventures_of_tom_sawyer.txt | the adventures of tom sawyer | ^\s*CHAPTER\s*[IVXLCM]+$ | twain | novel | 1876 | 1870 | 35 | 70276 |
| 76 | Twain/76-the_adventures_of_huckleberry_finn.txt | the adventures of huckleberry finn | ^\s*CHAPTER\s*(?:[IVXLCM]+\.|THE LAST)$ | twain | novel | 1884 | 1880 | 43 | 111908 |
| 86 | Twain/86-a_connecticut_yankee_in_king_arthurs_... | a connecticut yankee in king arthurs court | ^\s*(?:PREFACE|A WORD OF EXPLANATION|THE STRAN... | twain | novel | 1889 | 1880 | 47 | 119100 |
| 91 | Twain/91-tom_sawyer_abroad.txt | tom sawyer abroad | CHAPTER\s[IVXLCM]+\. | twain | novel | 1894 | 1890 | 13 | 33969 |
| 93 | Twain/93-tom_sawyer_detective.txt | tom sawyer detective | ^CHAPTER\s[IVXLCM]+\.\s[A-Z] | twain | novel | 1896 | 1890 | 11 | 23372 |
| 102 | Twain/102-the_tragedy_of_puddnhead_wilson.txt | the tragedy of puddnhead wilson | ^(?:A Whisper|CHAPTER\s[IVXLCM]+\.|CONCLUSION)$ | twain | novel | 1894 | 1890 | 22 | 53935 |
| 119 | Twain/119-a_tramp_abroad.txt | a tramp abroad | ^(?:CHAPTER\s[IVXLCM]+|APPENDIX\s[A-Z]\.)$ | twain | non-fiction | 1880 | 1880 | 55 | 159402 |
| 142 | Twain/142-the_30000_bequest_and_other_stories.txt | the 30000 bequest and other stories | THE \$30,000 BEQUEST$|A DOG'S TALE$|WAS IT HEA... | twain | stories | 1906 | 1900 | 25 | 93670 |
| 245 | Twain/245-life_on_the_mississippi.txt | life on the mississippi | ^(THE 'BODY OF THE NATION'|CHAPTER\s[0-9]+|APP... | twain | non-fiction | 1883 | 1880 | 65 | 145691 |
| 1044 | Twain/1044-extract_from_captain_stormfields_vi... | extract from captain stormfields visit to Heaven | CHAPTER\s[IVXLCM]+$ | twain | stories | 1909 | 1900 | 2 | 15010 |
| 1086 | Twain/1086-a_horses_tale.txt | a horses tale | ^[IVXLCM]+$ | twain | novel | 1907 | 1900 | 15 | 17085 |
| 1837 | Twain/1837-the_prince_and_the_pauper.txt | the prince and the pauper | ^\s*CHAPTER\s*[IVXLCM]+ | twain | novel | 1881 | 1880 | 33 | 69786 |
| 2874 | Twain/2874-personal_recollections_of_joan_of_a... | personal recollections of joan of arc vol 1 | ^Chapter\s[0-9]+ | twain | non-fiction | 1896 | 1890 | 35 | 77803 |
| 2875 | Twain/2875-personal_recollections_of_joan_of_a... | personal recollections of joan of arc vol 2 | ^[0-9]+\s[A-Z]+ | twain | non-fiction | 1896 | 1890 | 38 | 71618 |
| 2895 | Twain/2895-following_the_equator.txt | following the equator | ^(CHAPTER[,]?\s[IVXLCM]+|CONCLUSION)\.$ | twain | non-fiction | 1897 | 1890 | 71 | 190158 |
| 3171 | Twain/3171-in_defense_of_harriet_shelley.txt | in defense of harriet shelley | ^[IVXLCM]+$ | twain | non-fiction | 1918 | 1910 | 3 | 15833 |
| 3172 | Twain/3172-fenimore_coopers_literary_offences.txt | fenimore coopers literary offences | The Pathfinder and The Deerslayer | twain | non-fiction | 1895 | 1890 | 1 | 4948 |
| 3173 | Twain/3173-essays_on_paul_bourget.txt | essays on paul bourget | (WHAT PAUL BOURGET|A LITTLE NOTE TO) | twain | non-fiction | 1890 | 1890 | 2 | 11035 |
| 3176 | Twain/3176-the_innocents_abroad.txt | the innocents abroad | ^\s*(CHAPTER\s*[IVXLCM]+\.$|CONCLUSION) | twain | non-fiction | 1869 | 1860 | 62 | 193699 |
| 3177 | Twain/3177-roughing_it.txt | roughing it | ^(CHAPTER\s[IVXLCM]+|APPENDIX)\.$ | twain | novel | 1872 | 1870 | 79 | 165350 |
| 3178 | Twain/3178-the_gilded_age.txt | the gilded age | ^(CHAPTER\s[IVXLCM]+|APPENDIX)\.$ | twain | novel | 1873 | 1870 | 64 | 160518 |
| 3179 | Twain/3179-the_american_claimant.txt | the american claimant | ^(CHAPTER\s[IVXLCM]+|APPENDIX)\.$ | twain | novel | 1892 | 1890 | 26 | 64036 |
| 3180 | Twain/3180-a_double_barrelled_detective_story.txt | a double barrelled detective story | ^[IVXLCM]+[\.]?$ | twain | stories | 1902 | 1900 | 10 | 19542 |
| 3181 | Twain/3181-the_stolen_white_elephant.txt | the stolen white elephant | ^[IVXLCM]+[\.]?$ | twain | stories | 1882 | 1880 | 3 | 6807 |
| 3182 | Twain/3182-some_rambling_notes_of_an_idle_excu... | some rambling notes of an idle excursion | ^[IVXLCM]+\.$ | twain | non-fiction | 1877 | 1870 | 4 | 16595 |
| 3183 | Twain/3183-the_facts_concerning_the_recent_car... | the facts concerning the recent carnival of cr... | I was feeling blithe | twain | stories | 1877 | 1870 | 1 | 6579 |
| 3184 | Twain/3184-alonzo_fitz_and_other_stories.txt | alonzo fitz and other stories | THE LOVES OF ALONZO FITZ CLARENCE AND ROSANNAH... | twain | stories | 1878 | 1870 | 13 | 30366 |
| 3185 | Twain/3185-those_extraordinary_twins.txt | those extraordinary twins | ^CHAPTER\s[IVXLCM]+\.\s[A-Z]+ | twain | stories | 1892 | 1890 | 10 | 20039 |
| 3186 | Twain/3186-the_mysterious_stranger_and_other_s... | the mysterious stranger and other stories | ^(Chapter\s[0-9]+|A FABLE|HUNTING THE DECEITFU... | twain | stories | 1916 | 1910 | 14 | 41793 |
| 3188 | Twain/3188-mark_twain_speeches.txt | mark twain speeches | INTRODUCTION$|PREFACE$|THE STORY OF A SPEECH$|... | twain | non-fiction | 1880 | 1880 | 105 | 92256 |
| 3189 | Twain/3189-sketches_new_and_old.txt | sketches new and old | MY WATCH|POLITICAL ECONOMY|THE JUMPING FROG|JO... | twain | stories | 1916 | 1910 | 52 | 97108 |
| 3190 | Twain/3190-1601_conversation_as_it_was_by_the_... | 1601 conversation as it was by the social fire... | ^(INTRODUCTION|THE FIRST PRINTING|FOOTNOTES|PA... | twain | stories | 1880 | 1880 | 4 | 11700 |
| 3191 | Twain/3191-goldsmiths_friend_abroad_again.txt | goldsmiths friend abroad again | LETTER\s[IVXLCM]+ | twain | stories | 1870 | 1870 | 7 | 6149 |
| 3192 | Twain/3192-the_curious_republic_of_gondour_and... | the curious republic of gondour and other whim... | THE CURIOUS REPUBLIC OF GONDOUR|A MEMORY|INTRO... | twain | stories | 1919 | 1910 | 14 | 16722 |
| 3199 | Twain/3199-the_letters_of_mark_twain.txt | the letters of mark twain | ^[IVXLCM]+\. [A-Z]+\s | twain | non-fiction | 1853 | 1850 | 31 | 272698 |
| 3250 | Twain/3250-how_to_tell_a_story_and_other_essay... | how to tell a story and other essays | HOW TO TELL A STORY$|THE WOUNDED SOLDIER.$|THE... | twain | non-fiction | 1897 | 1890 | 5 | 7420 |
| 3251 | Twain/3251-the_man_that_corrupted_hadleyburg_a... | the man that corrupted hadleyburg and other st... | ^THE MAN THAT CORRUPTED HADLEYBURG$|^MY FIRST ... | twain | stories | 1900 | 1900 | 15 | 112965 |
| 19484 | Twain/19484-editorial_wild_oats.txt | editorial wild oats | ^My First Literary Venture$|^Journalism in Ten... | twain | stories | 1875 | 1870 | 5 | 9777 |
| 19987 | Twain/19987-chapters_from_my_autobiography.txt | chapters from my autobiography | ^(INTRODUCTION|[IVXLCM]+|CHAPTERS FROM MY AUTO... | twain | non-fiction | 1906 | 1900 | 31 | 110834 |
| 33077 | Twain/33077-the_treaty_with_china_its_provisio... | the treaty with china its provisions explained | ^New York Tribune | twain | non-fiction | 1868 | 1860 | 1 | 7142 |
| 60900 | Twain/60900-merry_tales.txt | merry tales | ^THE PRIVATE HISTORY OF A CAMPAIGN THAT FAILED... | twain | stories | 1892 | 1890 | 6 | 36846 |
| 61522 | Twain/61522-the_1000000_bank_note.txt | the 1000000 bank note | ^_THE £1,000,000 BANK-NOTE_$|^_METNAL TELEGRAP... | twain | stories | 1893 | 1890 | 6 | 65207 |
| 62636 | Twain/62636-to_the_person_sitting_in_darkness.txt | to the person sitting in darkness | ^Extending the Blessings | twain | non-fiction | 1901 | 1900 | 1 | 4719 |
| 62739 | Twain/62739-king_leopolds_soliloquy.txt | king leopolds soliloquy | ^(\[_Throws down pamphlets which he has|Footnote) | twain | stories | 1905 | 1900 | 6 | 12797 |
CORPUS
| pos_tuple | pos | token_str | term_str | |||||
|---|---|---|---|---|---|---|---|---|
| book_id | chap_id | para_num | sent_num | token_num | ||||
| 70 | 1 | 1 | 0 | 0 | ('By', 'IN') | IN | By | by |
| 1 | ('Mark', 'NNP') | NNP | Mark | mark | ||||
| 2 | ('Twain', 'NNP') | NNP | Twain | twain | ||||
| 2 | 0 | 0 | ('(Samuel', 'JJ') | JJ | (Samuel | samuel | ||
| 1 | ('Langhorne', 'NNP') | NNP | Langhorne | langhorne | ||||
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 62739 | 6 | 13 | 0 | 8 | ("Leopold's", 'NNP') | NNP | Leopold's | leopolds |
| 9 | ('Soliloquy,', 'NNP') | NNP | Soliloquy, | soliloquy | ||||
| 10 | ('by', 'IN') | IN | by | by | ||||
| 11 | ('Mark', 'NNP') | NNP | Mark | mark | ||||
| 12 | ('Twain', 'NNP') | NNP | Twain | twain |
2970356 rows × 4 columns
V_TRAIN = sorted(list(set(VOCAB.index)))
len(V_TRAIN)
53853
# convert col type to str otherwise errors when generating training sentences
CORPUS['term_str'] = CORPUS.term_str.astype('str')
CORPUS['token_str'] = CORPUS.term_str.astype('str')
S_TRAIN = list(CORPUS.groupby(OHCO[:-1]).term_str.apply(lambda x: ' '.join(x)).values)
len(S_TRAIN)
155621
S_TRAIN[:5]
['by mark twain', 'samuel langhorne clemens 1835 1910', 'contents', 'what is man', 'the death of jean']
train = NgramCounter(S_TRAIN, V_TRAIN)
train.generate()
train.I
| w0 | w1 | w2 | ||
|---|---|---|---|---|
| sent_num | token_num | |||
| 0 | 0 | <s> | <s> | by |
| 1 | <s> | by | mark | |
| 2 | by | mark | twain | |
| 3 | mark | twain | </s> | |
| 4 | twain | </s> | <s> | |
| ... | ... | ... | ... | ... |
| 155620 | 11 | soliloquy | by | mark |
| 12 | by | mark | twain | |
| 13 | mark | twain | </s> | |
| 14 | twain | </s> | NaN | |
| 15 | </s> | NaN | NaN |
3437219 rows × 3 columns
stop_words = VOCAB.loc[VOCAB.stop == 1].index.values
unigram_df = train.LM[0].sort_values('n', ascending = False)
unigram_df
| n | mle | p | log_p | |
|---|---|---|---|---|
| w0 | ||||
| <s> | 311242 | 9.055053e-02 | 9.055053e-02 | -3.465133 |
| the | 161902 | 4.710261e-02 | 4.710261e-02 | -4.408049 |
| </s> | 155621 | 4.527526e-02 | 4.527526e-02 | -4.465133 |
| and | 128130 | 3.727723e-02 | 3.727723e-02 | -4.745562 |
| of | 82827 | 2.409710e-02 | 2.409710e-02 | -5.374997 |
| ... | ... | ... | ... | ... |
| jumieges | 1 | 2.909329e-07 | 2.909329e-07 | -21.712810 |
| jument | 1 | 2.909329e-07 | 2.909329e-07 | -21.712810 |
| jumblings | 1 | 2.909329e-07 | 2.909329e-07 | -21.712810 |
| jumbling | 1 | 2.909329e-07 | 2.909329e-07 | -21.712810 |
| übergeschlagen | 1 | 2.909329e-07 | 2.909329e-07 | -21.712810 |
53854 rows × 4 columns
unigram_df.filter(regex = '^[^<]', axis = 0)
| n | mle | p | log_p | |
|---|---|---|---|---|
| w0 | ||||
| the | 161902 | 4.710261e-02 | 4.710261e-02 | -4.408049 |
| and | 128130 | 3.727723e-02 | 3.727723e-02 | -4.745562 |
| of | 82827 | 2.409710e-02 | 2.409710e-02 | -5.374997 |
| a | 76650 | 2.230000e-02 | 2.230000e-02 | -5.486812 |
| to | 75666 | 2.201373e-02 | 2.201373e-02 | -5.505453 |
| ... | ... | ... | ... | ... |
| jumieges | 1 | 2.909329e-07 | 2.909329e-07 | -21.712810 |
| jument | 1 | 2.909329e-07 | 2.909329e-07 | -21.712810 |
| jumblings | 1 | 2.909329e-07 | 2.909329e-07 | -21.712810 |
| jumbling | 1 | 2.909329e-07 | 2.909329e-07 | -21.712810 |
| übergeschlagen | 1 | 2.909329e-07 | 2.909329e-07 | -21.712810 |
53852 rows × 4 columns
bigram_df = train.LM[1].sort_values('n', ascending = False)
bigram_df
| n | mle | mle2 | p | log_p | ||
|---|---|---|---|---|---|---|
| w0 | w1 | |||||
| <s> | <s> | 155621 | 4.527528e-02 | 0.500000 | 0.426250 | -1.230230 |
| </s> | <s> | 155620 | 4.527499e-02 | 0.999994 | 0.742910 | -0.428741 |
| of | the | 18933 | 5.508234e-03 | 0.228585 | 0.138527 | -2.851762 |
| <s> | i | 14638 | 4.258677e-03 | 0.047031 | 0.040096 | -4.640387 |
| in | the | 14055 | 4.089063e-03 | 0.277954 | 0.134610 | -2.893140 |
| ... | ... | ... | ... | ... | ... | ... |
| hillfor | just | 1 | 2.909330e-07 | 1.000000 | 0.000037 | -14.716793 |
| hillock | at | 1 | 2.909330e-07 | 0.333333 | 0.000037 | -14.716846 |
| five | 1 | 2.909330e-07 | 0.333333 | 0.000037 | -14.716846 | |
| he | 1 | 2.909330e-07 | 0.333333 | 0.000037 | -14.716846 | |
| übergeschlagen | </s> | 1 | 2.909330e-07 | 1.000000 | 0.000037 | -14.716793 |
764796 rows × 5 columns
reduced_bigram = bigram_df.reset_index()
# remove spaces
reduced_bigram = reduced_bigram.loc[~((reduced_bigram.w0.str.contains('<')) | (reduced_bigram.w1.str.contains('<')))]
# remove stop words
reduced_bigram = reduced_bigram.loc[~((reduced_bigram.w0.isin(stop_words)) | (reduced_bigram.w1.isin(stop_words)))].set_index(['w0', 'w1'])
reduced_bigram
| n | mle | mle2 | p | log_p | ||
|---|---|---|---|---|---|---|
| w0 | w1 | |||||
| new | york | 738 | 2.147085e-04 | 0.244128 | 0.012993 | -6.266127 |
| mark | twain | 608 | 1.768872e-04 | 0.480253 | 0.011049 | -6.499990 |
| years | ago | 597 | 1.736870e-04 | 0.197095 | 0.010513 | -6.571708 |
| dont | know | 506 | 1.472121e-04 | 0.158571 | 0.008888 | -6.813971 |
| good | deal | 404 | 1.175369e-04 | 0.084149 | 0.006905 | -7.178188 |
| ... | ... | ... | ... | ... | ... | ... |
| hill | west | 1 | 2.909330e-07 | 0.002618 | 0.000037 | -14.726963 |
| wheeling | 1 | 2.909330e-07 | 0.002618 | 0.000037 | -14.726963 | |
| wherewith | 1 | 2.909330e-07 | 0.002618 | 0.000037 | -14.726963 | |
| work | 1 | 2.909330e-07 | 0.002618 | 0.000037 | -14.726963 | |
| hillock | five | 1 | 2.909330e-07 | 0.333333 | 0.000037 | -14.716846 |
325618 rows × 5 columns
trigram_df = train.LM[2].sort_values('n', ascending = False)
trigram_df
| n | mle | mle2 | p | log_p | |||
|---|---|---|---|---|---|---|---|
| w0 | w1 | w2 | |||||
| </s> | <s> | <s> | 155620 | 4.527500e-02 | 1.000000 | 5.365485e-05 | -14.185932 |
| <s> | <s> | i | 14638 | 4.258678e-03 | 0.094062 | 5.047219e-06 | -17.596080 |
| the | 12480 | 3.630844e-03 | 0.080195 | 4.303186e-06 | -17.826163 | ||
| it | 8015 | 2.331828e-03 | 0.051503 | 2.763748e-06 | -18.464942 | ||
| he | 7764 | 2.258804e-03 | 0.049890 | 2.677209e-06 | -18.510839 | ||
| ... | ... | ... | ... | ... | ... | ... | ... |
| harte | and | mr | 1 | 2.909330e-07 | 0.250000 | 6.895949e-10 | -30.433532 |
| i | 1 | 2.909330e-07 | 0.250000 | 6.895949e-10 | -30.433532 | ||
| getting | 1 | 2.909330e-07 | 0.250000 | 6.895949e-10 | -30.433532 | ||
| clemens | 1 | 2.909330e-07 | 0.250000 | 6.895949e-10 | -30.433532 | ||
| übergeschlagen | </s> | <s> | 1 | 2.909330e-07 | 1.000000 | 6.895949e-10 | -30.433532 |
1914877 rows × 5 columns
reduced_trigram = trigram_df.reset_index()
# remove spaces
reduced_trigram = reduced_trigram.loc[~((reduced_trigram.w0.str.contains('<')) | (reduced_trigram.w1.str.contains('<')) | (reduced_trigram.w2.str.contains('<')))]
# remove stop words
reduced_trigram = reduced_trigram.loc[~((reduced_trigram.w0.isin(stop_words)) | (reduced_trigram.w1.isin(stop_words)) | (reduced_trigram.w2.isin(stop_words)))].set_index(['w0', 'w1', 'w2'])
reduced_trigram
| n | mle | mle2 | p | log_p | |||
|---|---|---|---|---|---|---|---|
| w0 | w1 | w2 | |||||
| hundred | years | ago | 76 | 2.211091e-05 | 0.288973 | 2.654940e-08 | -25.166745 |
| wilsons | new | calendar | 74 | 2.152905e-05 | 1.000000 | 2.585981e-08 | -25.204713 |
| puddnhead | wilsons | new | 74 | 2.152905e-05 | 0.632479 | 2.585981e-08 | -25.204713 |
| twenty | four | hours | 70 | 2.036531e-05 | 0.569106 | 2.448062e-08 | -25.283785 |
| yrs | ever | mark | 55 | 1.600132e-05 | 0.833333 | 1.930866e-08 | -25.626177 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| harsher | methods | would | 1 | 2.909330e-07 | 1.000000 | 6.895949e-10 | -30.433532 |
| harsh | words | troubled | 1 | 2.909330e-07 | 1.000000 | 6.895949e-10 | -30.433532 |
| hart | came | along | 1 | 2.909330e-07 | 1.000000 | 6.895949e-10 | -30.433532 |
| harte | remarked | upon | 1 | 2.909330e-07 | 1.000000 | 6.895949e-10 | -30.433532 |
| became | editor | 1 | 2.909330e-07 | 1.000000 | 6.895949e-10 | -30.433532 |
157792 rows × 5 columns
reduced_trigram.head(10)
| n | mle | mle2 | p | log_p | |||
|---|---|---|---|---|---|---|---|
| w0 | w1 | w2 | |||||
| hundred | years | ago | 76 | 0.000022 | 0.288973 | 2.654940e-08 | -25.166745 |
| wilsons | new | calendar | 74 | 0.000022 | 1.000000 | 2.585981e-08 | -25.204713 |
| puddnhead | wilsons | new | 74 | 0.000022 | 0.632479 | 2.585981e-08 | -25.204713 |
| twenty | four | hours | 70 | 0.000020 | 0.569106 | 2.448062e-08 | -25.283785 |
| yrs | ever | mark | 55 | 0.000016 | 0.833333 | 1.930866e-08 | -25.626177 |
| mrs | jane | clemens | 41 | 0.000012 | 1.000000 | 1.448149e-08 | -26.041214 |
| fifty | years | ago | 38 | 0.000011 | 0.351852 | 1.344710e-08 | -26.148130 |
| puddnhead | wilsons | calendar | 38 | 0.000011 | 0.324786 | 1.344710e-08 | -26.148130 |
| five | years | ago | 36 | 0.000010 | 0.251748 | 1.275751e-08 | -26.224078 |
| hundred | years | 35 | 0.000010 | 0.204678 | 1.241271e-08 | -26.263607 |
LIB.type.value_counts()
non-fiction 17 stories 17 novel 11 Name: type, dtype: int64
# generate n-gram model using n-gram counter
model = NgramLanguageModel(train)
# implement smoothing (with k = 1) to prevent zero prob. for n-grams unseen in training data
model.k = 1
model.apply_smoothing()
# list of ngram token tables for unigram, bigram, trigram
ngram = model.NG
# list of unigram, bigram, trigram dfs with count, max likelihood estimate (MLE), joint prob, log joint prob, conditional prob, log conditional prob
LM = model.LM
Z1 = model.Z1
Z2 = model.Z2
.generate_text() method of the langmod.NgramLanguageModel object (model)¶model.generate_text()
01. COMFORTABLE GOOD SERVICE OF ANY KIND OF FANCY PERSIAN STUFF FILLED WITH HUMAN ASPECTS REMOVED OH VERY WELL FORTIFIED WITH RESPONSIBLE APPETITES AND GODLESS ATTITUDES OF THE LITTLE CREATURES AS ALL THOSE PEOPLE A LIMPID TORRENT GOES WHISTLING DOWN THE LIGHTNING HAS STRUCK A FINAL STRUGGLE FOR INDEPENDENCE AND THAT IS YOU CANNOT SEE THE BOIL SCAR UNDER ARMPIT. 02. AND THERE STOOD FATHER PETER HAD BEEN A DEAL FOR HE HAS OPENED HIS BOX AND PUT HIS HAND ON A HALF AND WOULD LIKE TO SAMPLE IT I TELL YOU IT WAS A POOR LITTLE CHAPAND WONDERING WHAT WAS IN OXFORD STREET JUST THENIT HAD BEEN ASLEEP. 03. DO YOU REMEMBER THAT HE IS IRRELEVANT AND ALSO FOR A CRIPPLED OCTOGENARIAN WHO COULD HAVE QUALIFIED AS A MANS HEAD ON THE CURBSTONES WERE STILL THE END OF THAT OLD MAN BECAME EXHAUSTED AND SPIRITUALLY RECONCILED. 04. BUT I KNOW THAT NOR THE LIE HE WOULD HE ASK THE INDIANS. 05. IN TIME FOR A LITTLE STRONGER INCLINATION LIKE THAT ONCE STOOD A MOMENT BEFORE AND AFTER THE DATE AND WE LOOKED DOWN UPON THE JEW HAS REAL NEED FOR HE PRESENTLY DIED A QUICK ESCAPE WHILE THE COST OF ADVERTISING THE LECTURE FIELD. 06. THEY HAVE NOT SUFFERED FROM CORPULENCE AND HAD NOT BEFORE HE ASKED. 07. WELL WE WERE COMPROMISED BY IT ALWAYS MAKES ME FEEL MUCH TROUBLED AND DESPONDENT MOREOVER HE WAS JUST WHAT I REGARDED THAT AS A MATTER OF HUGE WILD BOARS HEADS PRESERVED AND PERPETUATED AND WITH THEM AND TOOK HER TO REARRANGE THEM TO SHOW WHO WAS PRESENT HALF OF IT. 08. THIS COULD NOT CONSENT TO A DISTANCE BECAUSE ALTHOUGH HE DID NOT KNOW HOW TO SNATCH THE OLD TESTAMENT FOR A TRUNK AND GAVE MAMMA HER TICKET AND WITH A PATHETIC CONFUSION THEN GOT UP AND PUT IT WHERE IT CAN DO THAT BUT YOU WILL FIND IT VALUABLE. 09. SHE SAYS. 10. OH IT WAS THOSE THUGS HAVE BUILT MONUMENTS ON THEIR PRECIOUS ACCUMULATIONSAND AFTER ALL. 11. AFTER HIS RECOVERY FROM AN ENGLISH TOURIST RAILROAD UP THE INDIGNANT BRITISH LION ROSE WITH STEM THORN LEAVES PETALS COMPLETE AND THE QUIET REPLY GET YOUR OWN DEPUTIES AND NOT ONLY GLAD. 12. HE SAID SHE WOULD DECEIVE THE UNWARY. 13. AS IT IS DANGEROUS TO REFER TO THE RESPONSIBILITIES ON MYSELF. 14. NO. 15. HE SAID THE PUNISHMENT UPON THE PASSER BY WITH THE COLD AND HE ENJOYS RIDING ON A COMET. 16. THAT OFFICER MUST SUFFER IF WE LIKED WHETHER IT IS DELIVERED AND DAMNABLE PORTRAITS OF THE PILOT HOUSE AND I SHALL BE HELD BY THE LOAD UPON MY CONSCIENCE BEGAN TO PLAN THINGS AND NOT NEEDLESSLY ELABORATED YOUR REPORT OF THE EVASION WORKED ALL RIGHT ALL RIGHTHAVE IT YOUR SUBTLE PERSUASIONS THAT HAVE BEEN INNUMERABLE TEMPORARY SEEKERS OF 49 OR MAYBE HALF A MILE ALWAYS FOLLOWING PATHS WHICH HAD BEFALLEN TOM HAD NEVER SUFFERED. 17. WHO SAID PROCRASTINATION IS THE ENTIRE NATIONAL CHARACTER THAT IS THAT WHAT HIS HISTORY CREATING REVOLVER. 18. THE FOLLOWING LETTER. 19. I WAS AWAY FROM THE PARIS AND EVERYWHERE. 20. BUT WHEN IT IS TEN DOLLARS AND I JUMPED IN AND YEAR OUT OF WORK IN HER COUNTENANCE SHOWED UNCOMMON VIVACITY WITH A CHORUS FROM ALL THE HOUSES HAVE VANISHED.
.mle in the unigram, bigram, and trigram tables in the language modelV = len(VOCAB)
R = []
for i in range(3):
N = V**(i+1)
H = (train.LM[i]['mle'] * np.log2(1/train.LM[i]['mle'])).sum()
Hmax = np.log2(N)
R.append(int(round(1 - H/Hmax, 2) * 100))
R
[41, 50, 60]
BGX = model.LM[1].n.unstack() so use method below), explore the relationship between bigram pairs using the following lists for the first and second words of the bigrams of interest¶w0 = ['he', 'she']
w1 = ['said', 'heard']
bigram_pairs = [i for i in itertools.combinations(w0 + w1, 2) if i[0] in w0 and i[1] in w1]
LM[1].loc[bigram_pairs].n.unstack()
| w1 | said | heard |
|---|---|---|
| w0 | ||
| he | 1644 | 66 |
| she | 533 | 23 |
VOCAB.n.sort_values().plot(ylabel = "log_frequency", logy=True, style = '.', rot = 45, title = "Log Term Frequency");
VOCAB¶if 'term_rank' not in VOCAB.columns:
VOCAB = VOCAB.sort_values('n', ascending = False).reset_index()
VOCAB.index.name = 'term_rank'
VOCAB = VOCAB.reset_index()
VOCAB['term_rank'] = VOCAB['term_rank'] + 1
VOCAB = VOCAB.set_index('term_str')
VOCAB.term_rank.plot(ylabel = "term_rank", logx = False, rot = 45, title = "Term Rank");
VOCAB.head()
| term_rank | n | n_chars | p | i | max_pos | n_pos | cat_pos | stop | stem_porter | stem_snowball | stem_lancaster | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| term_str | ||||||||||||
| the | 1 | 161902 | 3 | 0.054506 | 4.197452 | DT | 20 | {'RP', 'NNS', 'FW', 'VBG', 'RB', 'JJ', 'PRP', ... | 1 | the | the | the |
| and | 2 | 128130 | 3 | 0.043136 | 4.534964 | CC | 17 | {'RP', 'VBZ', 'NNS', 'VB', 'VBD', 'NNP', 'POS'... | 1 | and | and | and |
| of | 3 | 82827 | 2 | 0.027884 | 5.164400 | IN | 17 | {'PDT', 'RP', 'VBZ', 'NNS', 'VB', 'VBD', 'NNP'... | 1 | of | of | of |
| a | 4 | 76650 | 1 | 0.025805 | 5.276215 | DT | 19 | {'RP', 'NNS', 'FW', 'RB', 'JJ', 'NNPS', 'PRP',... | 1 | a | a | a |
| to | 5 | 75666 | 2 | 0.025474 | 5.294856 | TO | 18 | {'PDT', 'RP', 'VBZ', 'JJR', 'NNS', 'TO', 'VB',... | 1 | to | to | to |
# times each num of times a term appears (e.g., 18273 terms appear 1 time)
# sort in descending order
# reset indices and rename cols --> nn: times each num of times a term appears
new_rank = VOCAB.n.value_counts()\
.sort_index(ascending = False).reset_index().reset_index()\
.rename(columns={'level_0': 'term_rank2', 'index': 'n', 'n': 'nn'})\
.set_index('n')
VOCAB['term_rank2'] = VOCAB.n.map(new_rank.term_rank2) + 1
VOCAB.term_rank2.plot(ylabel = 'term_rank2', logx = False, rot = 45, title = "Term Rank 2 (Words with Same Frequency Assigned Equal Rank)");
term_rank and term_rank2¶VOCAB['zipf_k'] = VOCAB.n * VOCAB.term_rank
VOCAB['zipf_k2'] = VOCAB.n * VOCAB.term_rank2
VOCAB.zipf_k.plot(style = ',', rot = 45);
VOCAB.zipf_k2.plot(style = ',', rot = 45);
n)¶As rank (term_rank2) increases, frequnecy (n) decreases
# scatter plot of term_rank2 vs. n color coded by part of speech (POS)
px.scatter(VOCAB.reset_index(),
x = 'term_rank2', y = 'n',
title = 'Term Rank (2) vs. Frequency (n)',
log_y = False, log_x = False,
hover_name = 'term_str',
color = 'max_pos',
height = 500, width = 800)
BOW = create_bow(CORPUS, CHAPS)
DTCM, TFIDF, BOW, DFIDF, VOCAB = get_tfidf(BOW, VOCAB, tf_method = 'max', idf_method = 'standard')
VOCAB
| term_rank | n | n_chars | p | i | max_pos | n_pos | cat_pos | stop | stem_porter | stem_snowball | stem_lancaster | term_rank2 | zipf_k | zipf_k2 | tfidf_mean_chap_max | tfidf_max_chap_max | df | idf | dfidf | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| term_str | ||||||||||||||||||||
| the | 1 | 161902 | 3 | 5.450559e-02 | 4.197452 | DT | 20 | {'RP', 'NNS', 'FW', 'VBG', 'RB', 'JJ', 'PRP', ... | 1 | the | the | the | 1 | 161902 | 161902 | 0.002433 | 0.002606 | 1106.0 | 0.002606 | 2.882784 |
| and | 2 | 128130 | 3 | 4.313598e-02 | 4.534964 | CC | 17 | {'RP', 'VBZ', 'NNS', 'VB', 'VBD', 'NNP', 'POS'... | 1 | and | and | and | 2 | 256260 | 256260 | 0.003973 | 0.005218 | 1104.0 | 0.005218 | 5.760351 |
| of | 3 | 82827 | 2 | 2.788437e-02 | 5.164400 | IN | 17 | {'PDT', 'RP', 'VBZ', 'NNS', 'VB', 'VBD', 'NNP'... | 1 | of | of | of | 3 | 248481 | 248481 | 0.001907 | 0.003912 | 1105.0 | 0.003912 | 4.322221 |
| a | 4 | 76650 | 1 | 2.580483e-02 | 5.276215 | DT | 19 | {'RP', 'NNS', 'FW', 'RB', 'JJ', 'NNPS', 'PRP',... | 1 | a | a | a | 4 | 306600 | 306600 | 0.003668 | 0.007834 | 1102.0 | 0.007834 | 8.632691 |
| to | 5 | 75666 | 2 | 2.547356e-02 | 5.294856 | TO | 18 | {'PDT', 'RP', 'VBZ', 'JJR', 'NNS', 'TO', 'VB',... | 1 | to | to | to | 5 | 378330 | 378330 | 0.001778 | 0.003912 | 1105.0 | 0.003912 | 4.322221 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| ouvre | 53850 | 1 | 5 | 3.366579e-07 | 21.502213 | NN | 1 | {'NN'} | 0 | ouvr | ouvr | ouvr | 945 | 53850 | 945 | 0.041792 | 0.041792 | 1.0 | 10.113742 | 10.113742 |
| outworks | 53851 | 1 | 8 | 3.366579e-07 | 21.502213 | NNS | 1 | {'NNS'} | 0 | outwork | outwork | outwork | 945 | 53851 | 945 | 0.066103 | 0.066103 | 1.0 | 10.113742 | 10.113742 |
| outwitted | 53852 | 1 | 9 | 3.366579e-07 | 21.502213 | JJ | 1 | {'JJ'} | 0 | outwit | outwit | outwit | 945 | 53852 | 945 | 0.056187 | 0.056187 | 1.0 | 10.113742 | 10.113742 |
| outvoted | 53853 | 1 | 8 | 3.366579e-07 | 21.502213 | VBD | 1 | {'VBD'} | 0 | outvot | outvot | outvot | 945 | 53853 | 945 | 0.091115 | 0.091115 | 1.0 | 10.113742 | 10.113742 |
| übergeschlagen | 53854 | 1 | 14 | 3.366579e-07 | 21.502213 | NN | 1 | {'NN'} | 0 | übergeschlagen | übergeschlagen | übergeschl | 945 | 53854 | 945 | 0.046607 | 0.046607 | 1.0 | 10.113742 | 10.113742 |
53854 rows × 20 columns
DTCM¶DTCM
| term_str | 0 | 00 | 01 | 02 | 03 | 04 | 05 | 06 | 07 | 08 | ... | étant | éternumens | étouffante | étranger | évitant | êtes | être | öffnen | über | übergeschlagen | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| book_id | chap_id | |||||||||||||||||||||
| 70 | 1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
| 3 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
| 4 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
| 5 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 62739 | 2 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
| 4 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
| 5 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
| 6 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1108 rows × 53852 columns
VOCAB, TFIDF matrix to the 1000 most significant terms¶# open POS categories
open_cats = ['NN', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'JJ',
'JJR', 'JJS', 'RB', 'RBR', 'RBS']
# reduce VOCAB to significant terms --> filter POS, sort , take top 1000
SIGS = VOCAB.loc[VOCAB.max_pos.isin(open_cats)] \
.sort_values('dfidf', ascending = False) \
.iloc[:1000,]
SIGS
| term_rank | n | n_chars | p | i | max_pos | n_pos | cat_pos | stop | stem_porter | stem_snowball | stem_lancaster | term_rank2 | zipf_k | zipf_k2 | tfidf_mean_chap_max | tfidf_max_chap_max | df | idf | dfidf | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| term_str | ||||||||||||||||||||
| saying | 401 | 705 | 6 | 0.000237 | 12.040734 | VBG | 12 | {'VBG', 'VBZ', 'NNS', 'VB', 'VBD', 'NNP', 'IN'... | 0 | say | say | say | 374 | 282705 | 263670 | 0.017010 | 0.096088 | 408.0 | 1.441317 | 588.057264 |
| seem | 378 | 752 | 4 | 0.000253 | 11.947624 | VB | 7 | {'VBZ', 'VB', 'NNP', 'JJ', 'VBP', 'CC', 'NN'} | 0 | seem | seem | seem | 354 | 284256 | 266208 | 0.017707 | 0.222286 | 407.0 | 1.444857 | 588.056873 |
| indeed | 373 | 758 | 6 | 0.000255 | 11.936159 | NN | 12 | {'VBZ', 'NNS', 'VB', 'VBD', 'NNP', 'POS', 'IN'... | 0 | inde | inde | indee | 351 | 282734 | 266058 | 0.018638 | 0.220656 | 410.0 | 1.434262 | 588.047447 |
| couldnt | 246 | 1219 | 7 | 0.000410 | 11.250731 | VBP | 11 | {'VBZ', 'NNS', 'VB', 'VBD', 'NNP', 'RB', 'MD',... | 0 | couldnt | couldnt | couldnt | 241 | 299874 | 293779 | 0.029341 | 0.181496 | 405.0 | 1.451964 | 588.045448 |
| door | 319 | 885 | 4 | 0.000298 | 11.712680 | NN | 8 | {'VBZ', 'NNS', 'VB', 'VBD', 'NNP', 'FW', 'JJ',... | 0 | door | door | door | 306 | 282315 | 270810 | 0.021854 | 0.290393 | 405.0 | 1.451964 | 588.045448 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| record | 1249 | 215 | 6 | 0.000072 | 13.754020 | NN | 6 | {'VBZ', 'VB', 'NNP', 'VBP', 'JJ', 'NN'} | 0 | record | record | record | 731 | 268535 | 157165 | 0.027074 | 0.269454 | 142.0 | 2.963995 | 420.887297 |
| watched | 1423 | 188 | 7 | 0.000063 | 13.947624 | VBD | 8 | {'VB', 'VBD', 'IN', 'VBN', 'JJR', 'VBP', 'JJ',... | 0 | watch | watch | watch | 758 | 267524 | 142504 | 0.025564 | 0.158080 | 142.0 | 2.963995 | 420.887297 |
| laws | 1231 | 219 | 4 | 0.000074 | 13.727426 | NNS | 8 | {'VBZ', 'NNS', 'JJS', 'VB', 'NNP', 'IN', 'JJ',... | 0 | law | law | law | 727 | 269589 | 159213 | 0.033740 | 0.635142 | 142.0 | 2.963995 | 420.887297 |
| information | 1484 | 180 | 11 | 0.000061 | 14.010360 | NN | 4 | {'JJ', 'VB', 'NN', 'NNS'} | 0 | inform | inform | inform | 766 | 267120 | 137880 | 0.024242 | 0.104000 | 142.0 | 2.963995 | 420.887297 |
| questions | 1276 | 212 | 9 | 0.000071 | 13.774293 | NNS | 7 | {'NNS', 'VB', 'NNP', 'RB', 'VBP', 'JJ', 'NN'} | 0 | question | question | quest | 734 | 270512 | 155608 | 0.033022 | 0.395199 | 142.0 | 2.963995 | 420.887297 |
1000 rows × 20 columns
SIGS.head(10).index.values
array(['saying', 'seem', 'indeed', 'couldnt', 'door', 'taken', 'deal',
'fifty', 'getting', 'perhaps'], dtype=object)
TFIDF
| term_str | 0 | 00 | 01 | 02 | 03 | 04 | 05 | 06 | 07 | 08 | ... | étant | éternumens | étouffante | étranger | évitant | êtes | être | öffnen | über | übergeschlagen | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| book_id | chap_id | |||||||||||||||||||||
| 70 | 1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
| 3 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
| 4 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
| 5 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 62739 | 2 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
| 4 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
| 5 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
| 6 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1108 rows × 53852 columns
TFIDF_sigs = TFIDF[SIGS.index]
TFIDF_sigs
| term_str | saying | seem | indeed | couldnt | door | taken | deal | fifty | getting | perhaps | ... | shame | dawn | privilege | loved | busy | record | watched | laws | information | questions | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| book_id | chap_id | |||||||||||||||||||||
| 70 | 1 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 |
| 2 | 0.005272 | 0.005285 | 0.005246 | 0.010622 | 0.001062 | 0.004187 | 0.002093 | 0.004187 | 0.002130 | 0.014907 | ... | 0.013009 | 0.000000 | 0.002168 | 0.002168 | 0.002168 | 0.006505 | 0.006505 | 0.006505 | 0.0 | 0.002168 | |
| 3 | 0.006831 | 0.000000 | 0.000000 | 0.006881 | 0.055051 | 0.000000 | 0.000000 | 0.006781 | 0.006898 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.042142 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 | |
| 4 | 0.005859 | 0.000000 | 0.005830 | 0.005902 | 0.000000 | 0.000000 | 0.011632 | 0.017448 | 0.000000 | 0.011834 | ... | 0.000000 | 0.012049 | 0.024098 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.012049 | |
| 5 | 0.008429 | 0.012674 | 0.004194 | 0.012737 | 0.004246 | 0.004183 | 0.004183 | 0.012550 | 0.008512 | 0.008512 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 | |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 62739 | 2 | 0.016015 | 0.008027 | 0.003984 | 0.004033 | 0.000000 | 0.003974 | 0.003974 | 0.000000 | 0.000000 | 0.000000 | ... | 0.024700 | 0.000000 | 0.000000 | 0.000000 | 0.008233 | 0.024700 | 0.000000 | 0.000000 | 0.0 | 0.000000 |
| 3 | 0.045041 | 0.045152 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.044711 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 | |
| 4 | 0.005093 | 0.005106 | 0.005068 | 0.010261 | 0.005131 | 0.000000 | 0.000000 | 0.005056 | 0.005143 | 0.000000 | ... | 0.010473 | 0.000000 | 0.000000 | 0.000000 | 0.010473 | 0.010473 | 0.000000 | 0.000000 | 0.0 | 0.000000 | |
| 5 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.013626 | 0.013626 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 | |
| 6 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.043356 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 |
1108 rows × 1000 columns
print('max term TFIDF value:', max(TFIDF_sigs.sum(axis=0)))
print('max TFIDF term:', TFIDF_sigs.sum(axis = 0).idxmax())
max term TFIDF value: 20.442455786925322 max TFIDF term: dont
print('Max total TFIDF value by book and chapter:', max(TFIDF_sigs.sum(axis=1)))
print('(Book_id, chap_id) with max total TFIDF: ', TFIDF_sigs.sum(axis = 1).idxmax())
print('Title of book with max total TFIDF:', LIB.loc[TFIDF_sigs.sum(axis = 1).idxmax()[0]].title.title())
Max total TFIDF value by book and chapter: 10.103244296859762 (Book_id, chap_id) with max total TFIDF: (102, 4) Title of book with max total TFIDF: The Tragedy Of Puddnhead Wilson
px.scatter(VOCAB.reset_index(),
x = 'term_rank2', y = 'tfidf_mean_chap_max',
title = 'Term Rank vs. TFIDF Mean (with chaps as bags of words, max TF method)',
color = 'max_pos', size = 'n_pos',
hover_name = 'term_str', hover_data = ['n', 'i'],
log_y = True, log_x = False)
px.scatter(VOCAB.reset_index(),
x = 'term_rank2', y = 'dfidf',
title = 'Term Rank vs. DFIDF',
color = 'max_pos', size = 'n_pos',
hover_name = 'term_str', hover_data = ['n', 'i'])
# group by book_id (BOOKS = OHCO[:1])
mean_TFIDF = TFIDF.groupby(BOOKS).mean()
mean_TFIDF
| term_str | 0 | 00 | 01 | 02 | 03 | 04 | 05 | 06 | 07 | 08 | ... | étant | éternumens | étouffante | étranger | évitant | êtes | être | öffnen | über | übergeschlagen |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| book_id | |||||||||||||||||||||
| 70 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 74 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 76 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 86 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 91 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 93 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 102 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 119 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 142 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 245 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 1044 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 1086 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 1837 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 2874 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 2875 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 2895 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 3171 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 3172 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 3173 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 3176 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 3177 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 3178 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 3179 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 3180 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 3181 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 3182 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 3183 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 3184 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 3185 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 3186 | 0.000000 | 0.000000 | 0.000000 | 0.003557 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 3188 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 3189 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | ... | 0.000804 | 0.000804 | 0.000000 | 0.001607 | 0.000804 | 0.000804 | 0.002411 | 0.000000 | 0.000000 | 0.000000 |
| 3190 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 3191 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 3192 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 3199 | 0.002481 | 0.004061 | 0.004469 | 0.004240 | 0.008474 | 0.011872 | 0.003477 | 0.002089 | 0.00178 | 0.005932 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 3250 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 3251 | 0.002787 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 19484 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 19987 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000899 | 0.00000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 33077 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 60900 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.013067 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.007768 | 0.007768 | 0.007768 |
| 61522 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 62636 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 62739 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
45 rows × 53852 columns
mean_TFIDF_sigs = TFIDF_sigs.groupby(BOOKS).mean()
mean_TFIDF_sigs
| term_str | saying | seem | indeed | couldnt | door | taken | deal | fifty | getting | perhaps | ... | shame | dawn | privilege | loved | busy | record | watched | laws | information | questions |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| book_id | |||||||||||||||||||||
| 70 | 0.002214 | 0.005691 | 0.003876 | 0.005907 | 0.007669 | 0.005358 | 0.004758 | 0.009586 | 0.001288 | 0.005993 | ... | 0.003571 | 0.001598 | 0.004269 | 0.003706 | 0.003537 | 0.000905 | 0.000383 | 0.002605 | 0.004543 | 0.005220 |
| 74 | 0.006860 | 0.005358 | 0.005679 | 0.010177 | 0.018570 | 0.003364 | 0.001466 | 0.002715 | 0.010265 | 0.002536 | ... | 0.003079 | 0.002429 | 0.001020 | 0.006155 | 0.003242 | 0.001439 | 0.010875 | 0.002685 | 0.000952 | 0.002359 |
| 76 | 0.011842 | 0.008150 | 0.000312 | 0.051429 | 0.017287 | 0.000202 | 0.003498 | 0.002909 | 0.011204 | 0.000130 | ... | 0.001816 | 0.001550 | 0.000000 | 0.001259 | 0.000312 | 0.001902 | 0.005608 | 0.002546 | 0.000621 | 0.005413 |
| 86 | 0.004200 | 0.009052 | 0.008176 | 0.028338 | 0.009332 | 0.005638 | 0.007524 | 0.005537 | 0.006675 | 0.003103 | ... | 0.003548 | 0.005953 | 0.000699 | 0.000719 | 0.002557 | 0.002697 | 0.004592 | 0.019162 | 0.002403 | 0.005275 |
| 91 | 0.009653 | 0.012530 | 0.000000 | 0.073216 | 0.001092 | 0.000000 | 0.005023 | 0.005192 | 0.009605 | 0.000757 | ... | 0.001065 | 0.004166 | 0.000000 | 0.001416 | 0.004166 | 0.000000 | 0.005277 | 0.005399 | 0.000000 | 0.001163 |
| 93 | 0.005316 | 0.006406 | 0.000000 | 0.049434 | 0.016613 | 0.000000 | 0.005417 | 0.001492 | 0.018616 | 0.000000 | ... | 0.000524 | 0.006596 | 0.000000 | 0.005538 | 0.001821 | 0.000000 | 0.009421 | 0.004995 | 0.000000 | 0.009122 |
| 102 | 0.014221 | 0.001967 | 0.006542 | 0.020268 | 0.018873 | 0.006466 | 0.004435 | 0.002955 | 0.006372 | 0.003764 | ... | 0.007376 | 0.009475 | 0.001214 | 0.005499 | 0.001247 | 0.004911 | 0.007838 | 0.012873 | 0.000000 | 0.010592 |
| 119 | 0.004359 | 0.008560 | 0.005958 | 0.003688 | 0.006101 | 0.003824 | 0.006648 | 0.008816 | 0.004698 | 0.007719 | ... | 0.000762 | 0.003765 | 0.001968 | 0.000807 | 0.002459 | 0.003095 | 0.005096 | 0.003474 | 0.004153 | 0.001234 |
| 142 | 0.007268 | 0.004428 | 0.005972 | 0.003653 | 0.004329 | 0.004804 | 0.003849 | 0.008807 | 0.005343 | 0.006197 | ... | 0.003847 | 0.001328 | 0.005567 | 0.005362 | 0.003762 | 0.005743 | 0.001111 | 0.000177 | 0.002947 | 0.005733 |
| 245 | 0.003502 | 0.004068 | 0.002828 | 0.006774 | 0.006710 | 0.005372 | 0.006845 | 0.011155 | 0.006189 | 0.006840 | ... | 0.000774 | 0.001678 | 0.002992 | 0.004301 | 0.000807 | 0.001385 | 0.003055 | 0.003029 | 0.005406 | 0.003130 |
| 1044 | 0.010437 | 0.020477 | 0.004029 | 0.062185 | 0.002039 | 0.000000 | 0.019964 | 0.010987 | 0.006451 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.009623 | 0.009623 | 0.000000 |
| 1086 | 0.011302 | 0.003689 | 0.009535 | 0.029505 | 0.002482 | 0.000000 | 0.004162 | 0.001445 | 0.001688 | 0.002092 | ... | 0.002501 | 0.006483 | 0.001267 | 0.009410 | 0.004703 | 0.001080 | 0.000000 | 0.000000 | 0.001080 | 0.002102 |
| 1837 | 0.012585 | 0.007149 | 0.012676 | 0.000000 | 0.012856 | 0.006845 | 0.003559 | 0.001468 | 0.000861 | 0.000183 | ... | 0.007741 | 0.003273 | 0.004498 | 0.010069 | 0.004864 | 0.003048 | 0.003467 | 0.011345 | 0.005612 | 0.006456 |
| 2874 | 0.015162 | 0.006606 | 0.010654 | 0.006011 | 0.005050 | 0.005317 | 0.004128 | 0.002257 | 0.005871 | 0.003985 | ... | 0.002938 | 0.013244 | 0.007684 | 0.004001 | 0.005845 | 0.003880 | 0.003955 | 0.000901 | 0.000000 | 0.003481 |
| 2875 | 0.012666 | 0.003254 | 0.010652 | 0.002297 | 0.001937 | 0.012761 | 0.001903 | 0.004802 | 0.001641 | 0.006691 | ... | 0.006312 | 0.007214 | 0.002059 | 0.007591 | 0.005931 | 0.007519 | 0.003081 | 0.001453 | 0.000000 | 0.020572 |
| 2895 | 0.001588 | 0.007035 | 0.007057 | 0.004420 | 0.005550 | 0.006149 | 0.005118 | 0.007155 | 0.004609 | 0.006185 | ... | 0.003532 | 0.002955 | 0.004582 | 0.002643 | 0.001960 | 0.006552 | 0.001279 | 0.004053 | 0.002712 | 0.000622 |
| 3171 | 0.003821 | 0.017271 | 0.027987 | 0.000000 | 0.007740 | 0.013207 | 0.003190 | 0.002218 | 0.012272 | 0.025470 | ... | 0.006609 | 0.000000 | 0.009148 | 0.040620 | 0.004595 | 0.000000 | 0.004553 | 0.000000 | 0.004553 | 0.004595 |
| 3172 | 0.004252 | 0.008524 | 0.004231 | 0.000000 | 0.000000 | 0.004220 | 0.004220 | 0.021102 | 0.000000 | 0.004294 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.008743 | 0.008743 | 0.008743 | 0.000000 | 0.000000 |
| 3173 | 0.015125 | 0.009352 | 0.017372 | 0.000000 | 0.004296 | 0.000000 | 0.002315 | 0.006945 | 0.010968 | 0.009017 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.004796 | 0.008769 | 0.013565 |
| 3176 | 0.001588 | 0.004203 | 0.000659 | 0.000562 | 0.004121 | 0.004841 | 0.005115 | 0.006773 | 0.005406 | 0.003615 | ... | 0.002576 | 0.001870 | 0.002391 | 0.001107 | 0.001318 | 0.002640 | 0.004249 | 0.000737 | 0.004758 | 0.002800 |
| 3177 | 0.003114 | 0.005003 | 0.006568 | 0.002544 | 0.005879 | 0.007228 | 0.005472 | 0.011755 | 0.003649 | 0.005145 | ... | 0.001418 | 0.005038 | 0.002063 | 0.000321 | 0.004459 | 0.003582 | 0.004551 | 0.001954 | 0.005832 | 0.002295 |
| 3178 | 0.005395 | 0.008850 | 0.019107 | 0.004711 | 0.010841 | 0.006513 | 0.009886 | 0.002842 | 0.005094 | 0.025737 | ... | 0.003037 | 0.001381 | 0.001287 | 0.010739 | 0.007819 | 0.005133 | 0.002626 | 0.001405 | 0.004985 | 0.002939 |
| 3179 | 0.007909 | 0.009067 | 0.014630 | 0.023126 | 0.007769 | 0.003826 | 0.009298 | 0.004420 | 0.007689 | 0.011021 | ... | 0.012363 | 0.002986 | 0.004664 | 0.002803 | 0.002266 | 0.000679 | 0.003891 | 0.005122 | 0.002008 | 0.004712 |
| 3180 | 0.009681 | 0.002094 | 0.004293 | 0.004316 | 0.006482 | 0.004786 | 0.001616 | 0.005246 | 0.003621 | 0.006037 | ... | 0.013996 | 0.004234 | 0.007141 | 0.007229 | 0.003222 | 0.000000 | 0.007799 | 0.002117 | 0.003545 | 0.005526 |
| 3181 | 0.008897 | 0.003345 | 0.020116 | 0.000000 | 0.003559 | 0.011429 | 0.003507 | 0.016560 | 0.003369 | 0.008060 | ... | 0.000000 | 0.000000 | 0.014126 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.009148 | 0.000000 |
| 3182 | 0.007100 | 0.004571 | 0.002464 | 0.007197 | 0.003550 | 0.005521 | 0.015412 | 0.009644 | 0.005714 | 0.003354 | ... | 0.002964 | 0.006544 | 0.004283 | 0.000000 | 0.007863 | 0.002964 | 0.002546 | 0.002546 | 0.000000 | 0.005093 |
| 3183 | 0.000000 | 0.005688 | 0.005647 | 0.005716 | 0.068597 | 0.000000 | 0.011266 | 0.011266 | 0.005730 | 0.017191 | ... | 0.023339 | 0.000000 | 0.000000 | 0.011669 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.023339 | 0.035008 |
| 3184 | 0.009640 | 0.006315 | 0.010998 | 0.001801 | 0.008365 | 0.005465 | 0.006711 | 0.016163 | 0.004563 | 0.005159 | ... | 0.006125 | 0.002465 | 0.006535 | 0.011737 | 0.002054 | 0.003847 | 0.000000 | 0.006089 | 0.003167 | 0.012000 |
| 3185 | 0.004623 | 0.008226 | 0.011253 | 0.015807 | 0.003437 | 0.004922 | 0.008367 | 0.000000 | 0.012881 | 0.001111 | ... | 0.007879 | 0.000000 | 0.011095 | 0.005812 | 0.000000 | 0.000000 | 0.000000 | 0.009074 | 0.000000 | 0.001029 |
| 3186 | 0.012936 | 0.005536 | 0.011350 | 0.011731 | 0.006083 | 0.004225 | 0.000426 | 0.001164 | 0.013767 | 0.003737 | ... | 0.002950 | 0.006137 | 0.002039 | 0.003903 | 0.006560 | 0.000647 | 0.008862 | 0.000511 | 0.000743 | 0.010033 |
| 3188 | 0.008052 | 0.007669 | 0.006826 | 0.009173 | 0.007591 | 0.005700 | 0.005608 | 0.009260 | 0.005486 | 0.007247 | ... | 0.001674 | 0.000838 | 0.010647 | 0.004021 | 0.004348 | 0.002296 | 0.001479 | 0.006154 | 0.002698 | 0.004198 |
| 3189 | 0.003429 | 0.005742 | 0.002021 | 0.005877 | 0.010844 | 0.005124 | 0.007386 | 0.006111 | 0.007227 | 0.004559 | ... | 0.005278 | 0.000492 | 0.001196 | 0.012707 | 0.000000 | 0.002477 | 0.001086 | 0.007143 | 0.005123 | 0.002480 |
| 3190 | 0.005380 | 0.002985 | 0.013965 | 0.000000 | 0.003403 | 0.001987 | 0.002236 | 0.007621 | 0.001137 | 0.009476 | ... | 0.000000 | 0.000000 | 0.000000 | 0.006947 | 0.002316 | 0.002316 | 0.000000 | 0.002316 | 0.000000 | 0.002316 |
| 3191 | 0.000000 | 0.006880 | 0.005692 | 0.000000 | 0.003457 | 0.005241 | 0.006813 | 0.000000 | 0.001873 | 0.000000 | ... | 0.010857 | 0.003815 | 0.021171 | 0.000000 | 0.000000 | 0.000000 | 0.003815 | 0.000000 | 0.000000 | 0.000000 |
| 3192 | 0.003366 | 0.000000 | 0.004816 | 0.004609 | 0.004700 | 0.015009 | 0.005170 | 0.006597 | 0.003493 | 0.001746 | ... | 0.000000 | 0.000000 | 0.007824 | 0.004705 | 0.000000 | 0.000738 | 0.000000 | 0.002076 | 0.000000 | 0.009315 |
| 3199 | 0.004374 | 0.007408 | 0.005917 | 0.007903 | 0.002941 | 0.010699 | 0.008695 | 0.005646 | 0.009059 | 0.014197 | ... | 0.001903 | 0.001175 | 0.003207 | 0.006340 | 0.008214 | 0.006850 | 0.000415 | 0.002745 | 0.002675 | 0.001782 |
| 3250 | 0.004133 | 0.018123 | 0.002079 | 0.022892 | 0.008372 | 0.002074 | 0.019889 | 0.002074 | 0.006328 | 0.004174 | ... | 0.000000 | 0.000000 | 0.004204 | 0.000000 | 0.004296 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.012613 |
| 3251 | 0.006054 | 0.010133 | 0.008176 | 0.005880 | 0.005100 | 0.005336 | 0.004149 | 0.004723 | 0.005424 | 0.006534 | ... | 0.006799 | 0.001920 | 0.000541 | 0.010186 | 0.003412 | 0.001670 | 0.001871 | 0.002968 | 0.002967 | 0.001553 |
| 19484 | 0.000000 | 0.011282 | 0.006236 | 0.000000 | 0.020720 | 0.003393 | 0.006878 | 0.010598 | 0.007479 | 0.001890 | ... | 0.000000 | 0.000000 | 0.002343 | 0.000000 | 0.000000 | 0.002343 | 0.000000 | 0.012743 | 0.020800 | 0.000000 |
| 19987 | 0.004113 | 0.003789 | 0.002411 | 0.015717 | 0.010104 | 0.005772 | 0.008337 | 0.007960 | 0.004121 | 0.008589 | ... | 0.004945 | 0.002879 | 0.005361 | 0.002353 | 0.002558 | 0.014937 | 0.001848 | 0.002943 | 0.000389 | 0.001020 |
| 33077 | 0.000000 | 0.011559 | 0.000000 | 0.002904 | 0.000000 | 0.005723 | 0.000000 | 0.000000 | 0.000000 | 0.023288 | ... | 0.000000 | 0.000000 | 0.023712 | 0.000000 | 0.000000 | 0.005928 | 0.000000 | 0.047424 | 0.000000 | 0.000000 |
| 60900 | 0.010154 | 0.011673 | 0.005251 | 0.015166 | 0.016750 | 0.004565 | 0.015933 | 0.011696 | 0.014317 | 0.003890 | ... | 0.005829 | 0.002000 | 0.002276 | 0.005535 | 0.004804 | 0.002889 | 0.001764 | 0.003771 | 0.002646 | 0.006895 |
| 61522 | 0.002880 | 0.008933 | 0.008071 | 0.014109 | 0.009320 | 0.005471 | 0.007913 | 0.003794 | 0.005171 | 0.007439 | ... | 0.002320 | 0.000514 | 0.002143 | 0.006499 | 0.001850 | 0.001813 | 0.000000 | 0.002230 | 0.003402 | 0.005628 |
| 62636 | 0.022174 | 0.000000 | 0.011033 | 0.005584 | 0.000000 | 0.000000 | 0.005503 | 0.005503 | 0.005598 | 0.016795 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.011400 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 62739 | 0.011025 | 0.009714 | 0.001509 | 0.002382 | 0.000855 | 0.010159 | 0.002933 | 0.008294 | 0.000857 | 0.000000 | ... | 0.012196 | 0.000000 | 0.000000 | 0.000000 | 0.003118 | 0.005862 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
45 rows × 1000 columns
DOC Table¶LIB table when books are docsbook_DOC = pd.DataFrame(index = mean_TFIDF.index)
book_DOC = book_DOC.join(LIB[['author', 'title']])
book_DOC['label'] = book_DOC.apply(lambda x: f"{x.author.split(',')[0]} {x.name}: {x.title}", 1)
book_DOC
| author | title | label | |
|---|---|---|---|
| book_id | |||
| 70 | twain | what is man | twain 70: what is man |
| 74 | twain | the adventures of tom sawyer | twain 74: the adventures of tom sawyer |
| 76 | twain | the adventures of huckleberry finn | twain 76: the adventures of huckleberry finn |
| 86 | twain | a connecticut yankee in king arthurs court | twain 86: a connecticut yankee in king arthurs... |
| 91 | twain | tom sawyer abroad | twain 91: tom sawyer abroad |
| 93 | twain | tom sawyer detective | twain 93: tom sawyer detective |
| 102 | twain | the tragedy of puddnhead wilson | twain 102: the tragedy of puddnhead wilson |
| 119 | twain | a tramp abroad | twain 119: a tramp abroad |
| 142 | twain | the 30000 bequest and other stories | twain 142: the 30000 bequest and other stories |
| 245 | twain | life on the mississippi | twain 245: life on the mississippi |
| 1044 | twain | extract from captain stormfields visit to Heaven | twain 1044: extract from captain stormfields v... |
| 1086 | twain | a horses tale | twain 1086: a horses tale |
| 1837 | twain | the prince and the pauper | twain 1837: the prince and the pauper |
| 2874 | twain | personal recollections of joan of arc vol 1 | twain 2874: personal recollections of joan of ... |
| 2875 | twain | personal recollections of joan of arc vol 2 | twain 2875: personal recollections of joan of ... |
| 2895 | twain | following the equator | twain 2895: following the equator |
| 3171 | twain | in defense of harriet shelley | twain 3171: in defense of harriet shelley |
| 3172 | twain | fenimore coopers literary offences | twain 3172: fenimore coopers literary offences |
| 3173 | twain | essays on paul bourget | twain 3173: essays on paul bourget |
| 3176 | twain | the innocents abroad | twain 3176: the innocents abroad |
| 3177 | twain | roughing it | twain 3177: roughing it |
| 3178 | twain | the gilded age | twain 3178: the gilded age |
| 3179 | twain | the american claimant | twain 3179: the american claimant |
| 3180 | twain | a double barrelled detective story | twain 3180: a double barrelled detective story |
| 3181 | twain | the stolen white elephant | twain 3181: the stolen white elephant |
| 3182 | twain | some rambling notes of an idle excursion | twain 3182: some rambling notes of an idle exc... |
| 3183 | twain | the facts concerning the recent carnival of cr... | twain 3183: the facts concerning the recent ca... |
| 3184 | twain | alonzo fitz and other stories | twain 3184: alonzo fitz and other stories |
| 3185 | twain | those extraordinary twins | twain 3185: those extraordinary twins |
| 3186 | twain | the mysterious stranger and other stories | twain 3186: the mysterious stranger and other ... |
| 3188 | twain | mark twain speeches | twain 3188: mark twain speeches |
| 3189 | twain | sketches new and old | twain 3189: sketches new and old |
| 3190 | twain | 1601 conversation as it was by the social fire... | twain 3190: 1601 conversation as it was by the... |
| 3191 | twain | goldsmiths friend abroad again | twain 3191: goldsmiths friend abroad again |
| 3192 | twain | the curious republic of gondour and other whim... | twain 3192: the curious republic of gondour an... |
| 3199 | twain | the letters of mark twain | twain 3199: the letters of mark twain |
| 3250 | twain | how to tell a story and other essays | twain 3250: how to tell a story and other essays |
| 3251 | twain | the man that corrupted hadleyburg and other st... | twain 3251: the man that corrupted hadleyburg ... |
| 19484 | twain | editorial wild oats | twain 19484: editorial wild oats |
| 19987 | twain | chapters from my autobiography | twain 19987: chapters from my autobiography |
| 33077 | twain | the treaty with china its provisions explained | twain 33077: the treaty with china its provisi... |
| 60900 | twain | merry tales | twain 60900: merry tales |
| 61522 | twain | the 1000000 bank note | twain 61522: the 1000000 bank note |
| 62636 | twain | to the person sitting in darkness | twain 62636: to the person sitting in darkness |
| 62739 | twain | king leopolds soliloquy | twain 62739: king leopolds soliloquy |
# binary table
L0 = mean_TFIDF_sigs.astype('bool').astype('int')
# Manhattan distance (L1 norm): divide each value by sum down cols
L1 = mean_TFIDF_sigs.apply(lambda x: x / x.sum(), 1)
# Euclidean distance (L2 norm)
L2 = mean_TFIDF_sigs.apply(lambda x: x / norm(x), 1) # Euclidean
assert round(L1.sum(1).sum()) == len(mean_TFIDF_sigs)
assert round(((L2.T)**2).sum().sum()) == len(mean_TFIDF_sigs)
PAIRS)¶mean_TFIDF_sigs.T.corr().stack()
book_id book_id
70 70 1.000000
74 0.065653
76 -0.006768
86 0.079010
91 -0.002018
...
62739 33077 0.158020
60900 -0.028375
61522 0.001523
62636 0.089411
62739 1.000000
Length: 2025, dtype: float64
# correlation between books --> stack and convert to df with col for raw correlation vals
PAIRS = 1 - mean_TFIDF_sigs.T.corr().stack().to_frame('corr_raw')
# rename indices
PAIRS.index.names = ['doc_a', 'doc_b']
# remove identities (e.g., corr(105, 105) and reverse dupliciates (e.g., corr(105, 121) = corr(121, 105))
PAIRS = PAIRS.query("doc_a > doc_b")
PAIRS
| corr_raw | ||
|---|---|---|
| doc_a | doc_b | |
| 74 | 70 | 0.934347 |
| 76 | 70 | 1.006768 |
| 74 | 0.500537 | |
| 86 | 70 | 0.920990 |
| 74 | 0.857285 | |
| ... | ... | ... |
| 62739 | 19987 | 0.915541 |
| 33077 | 0.841980 | |
| 60900 | 1.028375 | |
| 61522 | 0.998477 | |
| 62636 | 0.910589 |
990 rows × 1 columns
pdist()¶combos = [
(mean_TFIDF_sigs, 'cityblock', 'cityblock–raw'),
(mean_TFIDF_sigs, 'euclidean', 'euclidean–raw'),
(L2, 'euclidean', 'euclidean–l2'),
(mean_TFIDF_sigs, 'cosine', 'cosine–raw'),
(L1, 'cityblock', 'cityblock–l1'),
(L0, 'jaccard', 'jaccard–l0'),
(L0, 'jensenshannon', 'js–l0'),
(L1, 'jensenshannon', 'js–l1'),
(L2, 'jensenshannon', 'js–l2'),
]
for X, metric, label in combos:
PAIRS[label] = pdist(X, metric)
PAIRS.head(20)
| corr_raw | cityblock–raw | euclidean–raw | euclidean–l2 | cosine–raw | cityblock–l1 | jaccard–l0 | js–l0 | js–l1 | js–l2 | ||
|---|---|---|---|---|---|---|---|---|---|---|---|
| doc_a | doc_b | ||||||||||
| 74 | 70 | 0.934347 | 4.546088 | 0.234767 | 0.939812 | 0.441623 | 0.810756 | 0.037000 | 0.114145 | 0.368884 | 0.368884 |
| 76 | 70 | 1.006768 | 4.975003 | 0.297118 | 1.093061 | 0.597391 | 0.996911 | 0.128000 | 0.216051 | 0.448822 | 0.448822 |
| 74 | 0.500537 | 3.477613 | 0.178243 | 0.844661 | 0.356726 | 0.648449 | 0.008000 | 0.052759 | 0.299841 | 0.299841 | |
| 86 | 70 | 0.920990 | 4.889381 | 0.292809 | 1.114627 | 0.621196 | 1.037877 | 0.233233 | 0.297915 | 0.466381 | 0.466381 |
| 74 | 0.857285 | 5.511088 | 0.338610 | 1.151917 | 0.663457 | 1.127231 | 0.311000 | 0.350048 | 0.509520 | 0.509520 | |
| 76 | 0.873024 | 4.661049 | 0.244347 | 0.926136 | 0.428864 | 0.754032 | 0.043000 | 0.123192 | 0.348537 | 0.348537 | |
| 91 | 70 | 1.002018 | 3.006633 | 0.140112 | 0.758177 | 0.287416 | 0.623418 | 0.006000 | 0.045661 | 0.283683 | 0.283683 |
| 74 | 0.504966 | 3.346067 | 0.161667 | 0.766304 | 0.293611 | 0.635085 | 0.009000 | 0.055975 | 0.287828 | 0.287828 | |
| 76 | 0.176894 | 3.242933 | 0.186845 | 0.881473 | 0.388497 | 0.644574 | 0.007000 | 0.049337 | 0.297017 | 0.297017 | |
| 86 | 0.851959 | 6.592539 | 0.435993 | 1.107597 | 0.613386 | 1.009056 | 0.301603 | 0.343793 | 0.469018 | 0.469018 | |
| 93 | 70 | 1.028131 | 4.887719 | 0.249661 | 1.002306 | 0.502308 | 0.993955 | 0.259519 | 0.315950 | 0.447095 | 0.447095 |
| 74 | 0.562528 | 4.077018 | 0.188483 | 0.923076 | 0.426035 | 0.823772 | 0.052000 | 0.135704 | 0.371631 | 0.371631 | |
| 76 | 0.168563 | 3.664222 | 0.168963 | 0.841515 | 0.354074 | 0.723915 | 0.030030 | 0.102677 | 0.331382 | 0.331382 | |
| 86 | 0.885217 | 3.639949 | 0.174221 | 0.860308 | 0.370065 | 0.747925 | 0.053000 | 0.137028 | 0.342492 | 0.342492 | |
| 91 | 0.261414 | 2.630389 | 0.129681 | 0.712435 | 0.253782 | 0.567135 | 0.005000 | 0.041665 | 0.259342 | 0.259342 | |
| 102 | 70 | 0.958330 | 5.277857 | 0.308049 | 0.978975 | 0.479196 | 0.912322 | 0.310241 | 0.349162 | 0.430125 | 0.430125 |
| 74 | 0.531515 | 5.342585 | 0.279884 | 1.101894 | 0.607085 | 1.197783 | 0.582915 | 0.513600 | 0.552857 | 0.552857 | |
| 76 | 0.559315 | 6.213281 | 0.375334 | 0.995281 | 0.495292 | 0.985214 | 0.393180 | 0.400941 | 0.462958 | 0.462958 | |
| 86 | 0.892180 | 3.245179 | 0.158915 | 0.843261 | 0.355545 | 0.689906 | 0.006000 | 0.045661 | 0.313367 | 0.313367 | |
| 91 | 0.586890 | 3.370931 | 0.159585 | 0.833132 | 0.347054 | 0.687526 | 0.006000 | 0.045661 | 0.310325 | 0.310325 |
LIB['label'] = book_DOC['label']
def hca(sims, title="My Dendrogram", linkage_method='weighted', color_thresh=None, figsize=(15, 20)):
# calculate linkage using given method
tree = sch.linkage(sims, method=linkage_method)
# extract labels (title, year)
labels = LIB.label.values
# set color threshold
if not color_thresh:
color_thresh = pd.DataFrame(tree)[2].median()
# plot dendrograms for each distance metric and linkage method
plt.figure()
fig, axes = plt.subplots(figsize=figsize)
dendrogram = sch.dendrogram(tree,
labels=labels,
orientation="left",
count_sort=True,
distance_sort=True,
above_threshold_color='.75',
color_threshold=color_thresh
)
plt.tick_params(axis='both', which='major', labelsize=14)
fig.suptitle(title, fontsize=20)
for combo in combos:
# column in df (i.e., distance metric)
m = combo[-1]
# two lnikage methods
for l in ['ward','weighted']:
# title: distance metric - linkage method
title = f"{m}–{l}"
hca(PAIRS[m], title, linkage_method=l)
/var/folders/3n/4b11y5qn5cn20kztppfbsxq40000gn/T/ipykernel_24732/163819248.py:14: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
# noun taglist (excluding proper nouns)
noun_tags = ['NN', 'NNS']
SIGS.loc[SIGS.max_pos.isin(noun_tags)].sort_values('dfidf', ascending = False).head(20)
| term_rank | n | n_chars | p | i | max_pos | n_pos | cat_pos | stop | stem_porter | stem_snowball | stem_lancaster | term_rank2 | zipf_k | zipf_k2 | tfidf_mean_chap_max | tfidf_max_chap_max | df | idf | dfidf | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| term_str | ||||||||||||||||||||
| indeed | 373 | 758 | 6 | 0.000255 | 11.936159 | NN | 12 | {'VBZ', 'NNS', 'VB', 'VBD', 'NNP', 'POS', 'IN'... | 0 | inde | inde | indee | 351 | 282734 | 266058 | 0.018638 | 0.220656 | 410.0 | 1.434262 | 588.047447 |
| door | 319 | 885 | 4 | 0.000298 | 11.712680 | NN | 8 | {'VBZ', 'NNS', 'VB', 'VBD', 'NNP', 'FW', 'JJ',... | 0 | door | door | door | 306 | 282315 | 270810 | 0.021854 | 0.290393 | 405.0 | 1.451964 | 588.045448 |
| deal | 414 | 682 | 4 | 0.000230 | 12.088585 | NN | 4 | {'VBP', 'VB', 'NN', 'NNP'} | 0 | deal | deal | deal | 385 | 282348 | 262570 | 0.015758 | 0.110058 | 411.0 | 1.430748 | 588.037256 |
| children | 317 | 890 | 8 | 0.000300 | 11.704552 | NNS | 9 | {'VBZ', 'NNS', 'VB', 'VBD', 'NNP', 'VBN', 'VBP... | 0 | children | children | childr | 305 | 282130 | 271450 | 0.021761 | 0.323451 | 404.0 | 1.455531 | 588.034396 |
| money | 244 | 1226 | 5 | 0.000413 | 11.242470 | NN | 12 | {'PDT', 'VBZ', 'NNS', 'VB', 'VBD', 'NNP', 'VBN... | 0 | money | money | money | 239 | 299144 | 293014 | 0.027533 | 0.190299 | 412.0 | 1.427242 | 588.023555 |
| ones | 402 | 704 | 4 | 0.000237 | 12.042782 | NNS | 16 | {'NNPS', 'RP', 'VBZ', 'NNS', 'VB', 'VBD', 'NNP... | 0 | one | one | on | 375 | 283008 | 264000 | 0.016453 | 0.128745 | 403.0 | 1.459106 | 588.019773 |
| miles | 263 | 1117 | 5 | 0.000376 | 11.376800 | NNS | 9 | {'PDT', 'VBZ', 'NNS', 'VB', 'VBD', 'NNP', 'VBP... | 0 | mile | mile | mil | 257 | 293771 | 287069 | 0.027528 | 0.209469 | 401.0 | 1.466284 | 587.979780 |
| everybody | 339 | 842 | 9 | 0.000283 | 11.784537 | NN | 12 | {'RBR', 'RP', 'VBG', 'NNS', 'VB', 'VBD', 'NNP'... | 0 | everybodi | everybodi | everybody | 321 | 285438 | 270282 | 0.018832 | 0.137107 | 415.0 | 1.416775 | 587.961476 |
| family | 297 | 988 | 6 | 0.000333 | 11.553846 | NN | 10 | {'VBZ', 'NNS', 'VBD', 'NNP', 'VBN', 'RB', 'MD'... | 0 | famili | famili | famy | 287 | 293436 | 283556 | 0.021444 | 0.212516 | 415.0 | 1.416775 | 587.961476 |
| ground | 381 | 742 | 6 | 0.000250 | 11.966938 | NN | 9 | {'NNS', 'VB', 'VBD', 'NNP', 'VBN', 'VBG', 'VBP... | 0 | ground | ground | ground | 356 | 282702 | 264152 | 0.015985 | 0.100950 | 416.0 | 1.413302 | 587.933818 |
| friend | 344 | 817 | 6 | 0.000275 | 11.828021 | NN | 11 | {'PDT', 'VBZ', 'NNS', 'VB', 'VBD', 'NNP', 'IN'... | 0 | friend | friend | friend | 326 | 281048 | 266342 | 0.020920 | 0.144815 | 398.0 | 1.477118 | 587.892783 |
| none | 362 | 781 | 4 | 0.000263 | 11.893034 | NN | 11 | {'NNS', 'VB', 'CC', 'NNP', 'IN', 'VBN', 'RB', ... | 0 | none | none | non | 340 | 282722 | 265540 | 0.017026 | 0.099964 | 420.0 | 1.399497 | 587.788592 |
| chance | 386 | 727 | 6 | 0.000245 | 11.996402 | NN | 4 | {'VBP', 'NNP', 'JJ', 'NN'} | 0 | chanc | chanc | chant | 361 | 280622 | 262447 | 0.016715 | 0.236279 | 420.0 | 1.399497 | 587.788592 |
| state | 348 | 812 | 5 | 0.000273 | 11.836877 | NN | 5 | {'VBZ', 'VB', 'NNP', 'JJ', 'NN'} | 0 | state | state | stat | 328 | 282576 | 266336 | 0.018558 | 0.256574 | 420.0 | 1.399497 | 587.788592 |
| friends | 336 | 846 | 7 | 0.000285 | 11.777699 | NNS | 12 | {'NNPS', 'PDT', 'VBZ', 'NNS', 'VB', 'VBD', 'NN... | 0 | friend | friend | friend | 318 | 284256 | 269028 | 0.018750 | 0.116625 | 420.0 | 1.399497 | 587.788592 |
| air | 431 | 661 | 3 | 0.000223 | 12.133707 | NN | 9 | {'VBZ', 'VB', 'NNP', 'IN', 'FW', 'RB', 'VBP', ... | 0 | air | air | air | 396 | 284891 | 261756 | 0.016318 | 0.092915 | 392.0 | 1.499032 | 587.620670 |
| city | 267 | 1108 | 4 | 0.000373 | 11.388471 | NN | 7 | {'NNS', 'VB', 'NNP', 'VBP', 'JJ', 'NN', 'CD'} | 0 | citi | citi | city | 261 | 295836 | 289188 | 0.029952 | 0.281069 | 392.0 | 1.499032 | 587.620670 |
| words | 268 | 1107 | 5 | 0.000373 | 11.389774 | NNS | 15 | {'PDT', 'VBZ', 'NNS', 'WP', 'VB', 'VBD', 'NNP'... | 0 | word | word | word | 262 | 296676 | 290034 | 0.021894 | 0.272440 | 431.0 | 1.362198 | 587.107384 |
| hours | 316 | 901 | 5 | 0.000303 | 11.686830 | NNS | 9 | {'PDT', 'NNS', 'VB', 'VBD', 'NNP', 'VBP', 'JJ'... | 0 | hour | hour | hour | 304 | 284716 | 273904 | 0.017099 | 0.170275 | 431.0 | 1.362198 | 587.107384 |
| nobody | 395 | 711 | 6 | 0.000239 | 12.028507 | NN | 8 | {'VBZ', 'NNS', 'VB', 'NNP', 'RB', 'VBP', 'JJ',... | 0 | nobodi | nobodi | nobody | 370 | 280845 | 263070 | 0.019461 | 0.191097 | 384.0 | 1.528780 | 587.051391 |
top_20_nouns = list(VOCAB.loc[VOCAB.max_pos.isin(noun_tags)].sort_values('dfidf', ascending = False).head(20).index)
print(top_20_nouns)
['indeed', 'door', 'deal', 'children', 'money', 'ones', 'miles', 'family', 'everybody', 'ground', 'friend', 'state', 'none', 'friends', 'chance', 'air', 'city', 'words', 'hours', 'nobody']
BOW.groupby(BOOKS).mean().sort_values('tfidf', ascending = False).join(LIB, on = 'book_id')
| n | tf | tfidf | source_file_path | title | chap_regex | author | type | year | decade | n_chaps | book_len | label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| book_id | |||||||||||||
| 3188 | 2.540928 | 0.054027 | 0.078733 | Twain/3188-mark_twain_speeches.txt | mark twain speeches | INTRODUCTION$|PREFACE$|THE STORY OF A SPEECH$|... | twain | non-fiction | 1880 | 1880 | 105 | 92256 | twain 3188: mark twain speeches |
| 3191 | 2.330049 | 0.043699 | 0.062809 | Twain/3191-goldsmiths_friend_abroad_again.txt | goldsmiths friend abroad again | LETTER\s[IVXLCM]+ | twain | stories | 1870 | 1870 | 7 | 6149 | twain 3191: goldsmiths friend abroad again |
| 1086 | 2.659144 | 0.040938 | 0.059174 | Twain/1086-a_horses_tale.txt | a horses tale | ^[IVXLCM]+$ | twain | novel | 1907 | 1900 | 15 | 17085 | twain 1086: a horses tale |
| 3192 | 2.431230 | 0.032768 | 0.057134 | Twain/3192-the_curious_republic_of_gondour_and... | the curious republic of gondour and other whim... | THE CURIOUS REPUBLIC OF GONDOUR|A MEMORY|INTRO... | twain | stories | 1919 | 1910 | 14 | 16722 | twain 3192: the curious republic of gondour an... |
| 3250 | 2.963259 | 0.032567 | 0.051521 | Twain/3250-how_to_tell_a_story_and_other_essay... | how to tell a story and other essays | HOW TO TELL A STORY$|THE WOUNDED SOLDIER.$|THE... | twain | non-fiction | 1897 | 1890 | 5 | 7420 | twain 3250: how to tell a story and other essays |
| 3189 | 2.875911 | 0.027712 | 0.045378 | Twain/3189-sketches_new_and_old.txt | sketches new and old | MY WATCH|POLITICAL ECONOMY|THE JUMPING FROG|JO... | twain | stories | 1916 | 1910 | 52 | 97108 | twain 3189: sketches new and old |
| 102 | 3.021061 | 0.026733 | 0.043308 | Twain/102-the_tragedy_of_puddnhead_wilson.txt | the tragedy of puddnhead wilson | ^(?:A Whisper|CHAPTER\s[IVXLCM]+\.|CONCLUSION)$ | twain | novel | 1894 | 1890 | 22 | 53935 | twain 102: the tragedy of puddnhead wilson |
| 19484 | 2.700829 | 0.026177 | 0.042429 | Twain/19484-editorial_wild_oats.txt | editorial wild oats | ^My First Literary Venture$|^Journalism in Ten... | twain | stories | 1875 | 1870 | 5 | 9777 | twain 19484: editorial wild oats |
| 74 | 2.786740 | 0.026721 | 0.041384 | Twain/74-the_adventures_of_tom_sawyer.txt | the adventures of tom sawyer | ^\s*CHAPTER\s*[IVXLCM]+$ | twain | novel | 1876 | 1870 | 35 | 70276 | twain 74: the adventures of tom sawyer |
| 3180 | 2.894254 | 0.028208 | 0.040873 | Twain/3180-a_double_barrelled_detective_story.txt | a double barrelled detective story | ^[IVXLCM]+[\.]?$ | twain | stories | 1902 | 1900 | 10 | 19542 | twain 3180: a double barrelled detective story |
| 3185 | 2.962158 | 0.026052 | 0.039382 | Twain/3185-those_extraordinary_twins.txt | those extraordinary twins | ^CHAPTER\s[IVXLCM]+\.\s[A-Z]+ | twain | stories | 1892 | 1890 | 10 | 20039 | twain 3185: those extraordinary twins |
| 3190 | 2.793029 | 0.015731 | 0.037841 | Twain/3190-1601_conversation_as_it_was_by_the_... | 1601 conversation as it was by the social fire... | ^(INTRODUCTION|THE FIRST PRINTING|FOOTNOTES|PA... | twain | stories | 1880 | 1880 | 4 | 11700 | twain 3190: 1601 conversation as it was by the... |
| 3179 | 3.006103 | 0.026183 | 0.037614 | Twain/3179-the_american_claimant.txt | the american claimant | ^(CHAPTER\s[IVXLCM]+|APPENDIX)\.$ | twain | novel | 1892 | 1890 | 26 | 64036 | twain 3179: the american claimant |
| 62739 | 2.846308 | 0.019321 | 0.037133 | Twain/62739-king_leopolds_soliloquy.txt | king leopolds soliloquy | ^(\[_Throws down pamphlets which he has|Footnote) | twain | stories | 1905 | 1900 | 6 | 12797 | twain 62739: king leopolds soliloquy |
| 2875 | 2.898887 | 0.026507 | 0.037126 | Twain/2875-personal_recollections_of_joan_of_a... | personal recollections of joan of arc vol 2 | ^[0-9]+\s[A-Z]+ | twain | non-fiction | 1896 | 1890 | 38 | 71618 | twain 2875: personal recollections of joan of ... |
| 3181 | 2.742546 | 0.021657 | 0.036272 | Twain/3181-the_stolen_white_elephant.txt | the stolen white elephant | ^[IVXLCM]+[\.]?$ | twain | stories | 1882 | 1880 | 3 | 6807 | twain 3181: the stolen white elephant |
| 3177 | 2.671287 | 0.021494 | 0.036157 | Twain/3177-roughing_it.txt | roughing it | ^(CHAPTER\s[IVXLCM]+|APPENDIX)\.$ | twain | novel | 1872 | 1870 | 79 | 165350 | twain 3177: roughing it |
| 3184 | 3.026914 | 0.023980 | 0.035678 | Twain/3184-alonzo_fitz_and_other_stories.txt | alonzo fitz and other stories | THE LOVES OF ALONZO FITZ CLARENCE AND ROSANNAH... | twain | stories | 1878 | 1870 | 13 | 30366 | twain 3184: alonzo fitz and other stories |
| 1837 | 2.736118 | 0.021699 | 0.035655 | Twain/1837-the_prince_and_the_pauper.txt | the prince and the pauper | ^\s*CHAPTER\s*[IVXLCM]+ | twain | novel | 1881 | 1880 | 33 | 69786 | twain 1837: the prince and the pauper |
| 2874 | 3.034438 | 0.024597 | 0.033642 | Twain/2874-personal_recollections_of_joan_of_a... | personal recollections of joan of arc vol 1 | ^Chapter\s[0-9]+ | twain | non-fiction | 1896 | 1890 | 35 | 77803 | twain 2874: personal recollections of joan of ... |
| 60900 | 3.801300 | 0.014654 | 0.033563 | Twain/60900-merry_tales.txt | merry tales | ^THE PRIVATE HISTORY OF A CAMPAIGN THAT FAILED... | twain | stories | 1892 | 1890 | 6 | 36846 | twain 60900: merry tales |
| 245 | 2.905452 | 0.021770 | 0.033280 | Twain/245-life_on_the_mississippi.txt | life on the mississippi | ^(THE 'BODY OF THE NATION'|CHAPTER\s[0-9]+|APP... | twain | non-fiction | 1883 | 1880 | 65 | 145691 | twain 245: life on the mississippi |
| 76 | 3.836670 | 0.026139 | 0.033157 | Twain/76-the_adventures_of_huckleberry_finn.txt | the adventures of huckleberry finn | ^\s*CHAPTER\s*(?:[IVXLCM]+\.|THE LAST)$ | twain | novel | 1884 | 1880 | 43 | 111908 | twain 76: the adventures of huckleberry finn |
| 93 | 3.698687 | 0.027781 | 0.033141 | Twain/93-tom_sawyer_detective.txt | tom sawyer detective | ^CHAPTER\s[IVXLCM]+\.\s[A-Z] | twain | novel | 1896 | 1890 | 11 | 23372 | twain 93: tom sawyer detective |
| 3178 | 3.009788 | 0.022064 | 0.032297 | Twain/3178-the_gilded_age.txt | the gilded age | ^(CHAPTER\s[IVXLCM]+|APPENDIX)\.$ | twain | novel | 1873 | 1870 | 64 | 160518 | twain 3178: the gilded age |
| 86 | 3.058316 | 0.022648 | 0.031542 | Twain/86-a_connecticut_yankee_in_king_arthurs_... | a connecticut yankee in king arthurs court | ^\s*(?:PREFACE|A WORD OF EXPLANATION|THE STRAN... | twain | novel | 1889 | 1880 | 47 | 119100 | twain 86: a connecticut yankee in king arthurs... |
| 91 | 3.753066 | 0.023020 | 0.029092 | Twain/91-tom_sawyer_abroad.txt | tom sawyer abroad | CHAPTER\s[IVXLCM]+\. | twain | novel | 1894 | 1890 | 13 | 33969 | twain 91: tom sawyer abroad |
| 142 | 3.774277 | 0.018769 | 0.028576 | Twain/142-the_30000_bequest_and_other_stories.txt | the 30000 bequest and other stories | THE \$30,000 BEQUEST$|A DOG'S TALE$|WAS IT HEA... | twain | stories | 1906 | 1900 | 25 | 93670 | twain 142: the 30000 bequest and other stories |
| 2895 | 2.888958 | 0.016800 | 0.027830 | Twain/2895-following_the_equator.txt | following the equator | ^(CHAPTER[,]?\s[IVXLCM]+|CONCLUSION)\.$ | twain | non-fiction | 1897 | 1890 | 71 | 190158 | twain 2895: following the equator |
| 119 | 2.962312 | 0.016666 | 0.026501 | Twain/119-a_tramp_abroad.txt | a tramp abroad | ^(?:CHAPTER\s[IVXLCM]+|APPENDIX\s[A-Z]\.)$ | twain | non-fiction | 1880 | 1880 | 55 | 159402 | twain 119: a tramp abroad |
| 3173 | 3.853003 | 0.017103 | 0.026088 | Twain/3173-essays_on_paul_bourget.txt | essays on paul bourget | (WHAT PAUL BOURGET|A LITTLE NOTE TO) | twain | non-fiction | 1890 | 1890 | 2 | 11035 | twain 3173: essays on paul bourget |
| 3171 | 3.679526 | 0.015155 | 0.026074 | Twain/3171-in_defense_of_harriet_shelley.txt | in defense of harriet shelley | ^[IVXLCM]+$ | twain | non-fiction | 1918 | 1910 | 3 | 15833 | twain 3171: in defense of harriet shelley |
| 19987 | 3.472678 | 0.018381 | 0.026042 | Twain/19987-chapters_from_my_autobiography.txt | chapters from my autobiography | ^(INTRODUCTION|[IVXLCM]+|CHAPTERS FROM MY AUTO... | twain | non-fiction | 1906 | 1900 | 31 | 110834 | twain 19987: chapters from my autobiography |
| 3176 | 2.915266 | 0.015238 | 0.025448 | Twain/3176-the_innocents_abroad.txt | the innocents abroad | ^\s*(CHAPTER\s*[IVXLCM]+\.$|CONCLUSION) | twain | non-fiction | 1869 | 1860 | 62 | 193699 | twain 3176: the innocents abroad |
| 3186 | 3.631648 | 0.020284 | 0.025365 | Twain/3186-the_mysterious_stranger_and_other_s... | the mysterious stranger and other stories | ^(Chapter\s[0-9]+|A FABLE|HUNTING THE DECEITFU... | twain | stories | 1916 | 1910 | 14 | 41793 | twain 3186: the mysterious stranger and other ... |
| 62636 | 3.318565 | 0.012764 | 0.022179 | Twain/62636-to_the_person_sitting_in_darkness.txt | to the person sitting in darkness | ^Extending the Blessings | twain | non-fiction | 1901 | 1900 | 1 | 4719 | twain 62636: to the person sitting in darkness |
| 3182 | 3.176685 | 0.013717 | 0.021264 | Twain/3182-some_rambling_notes_of_an_idle_excu... | some rambling notes of an idle excursion | ^[IVXLCM]+\.$ | twain | non-fiction | 1877 | 1870 | 4 | 16595 | twain 3182: some rambling notes of an idle exc... |
| 3183 | 3.909091 | 0.015390 | 0.020693 | Twain/3183-the_facts_concerning_the_recent_car... | the facts concerning the recent carnival of cr... | I was feeling blithe | twain | stories | 1877 | 1870 | 1 | 6579 | twain 3183: the facts concerning the recent ca... |
| 70 | 4.404473 | 0.012261 | 0.019698 | Twain/70-what_is_man.txt | what is man | WHAT IS MAN?|THE DEATH OF JEAN|THE TURNING-POI... | twain | non-fiction | 1906 | 1900 | 17 | 96111 | twain 70: what is man |
| 1044 | 4.635578 | 0.013983 | 0.019182 | Twain/1044-extract_from_captain_stormfields_vi... | extract from captain stormfields visit to Heaven | CHAPTER\s[IVXLCM]+$ | twain | stories | 1909 | 1900 | 2 | 15010 | twain 1044: extract from captain stormfields v... |
| 3172 | 3.585507 | 0.010577 | 0.018147 | Twain/3172-fenimore_coopers_literary_offences.txt | fenimore coopers literary offences | The Pathfinder and The Deerslayer | twain | non-fiction | 1895 | 1890 | 1 | 4948 | twain 3172: fenimore coopers literary offences |
| 3251 | 4.639982 | 0.011230 | 0.016951 | Twain/3251-the_man_that_corrupted_hadleyburg_a... | the man that corrupted hadleyburg and other st... | ^THE MAN THAT CORRUPTED HADLEYBURG$|^MY FIRST ... | twain | stories | 1900 | 1900 | 15 | 112965 | twain 3251: the man that corrupted hadleyburg ... |
| 3199 | 4.475668 | 0.010489 | 0.016611 | Twain/3199-the_letters_of_mark_twain.txt | the letters of mark twain | ^[IVXLCM]+\. [A-Z]+\s | twain | non-fiction | 1853 | 1850 | 31 | 272698 | twain 3199: the letters of mark twain |
| 33077 | 3.772847 | 0.007546 | 0.014359 | Twain/33077-the_treaty_with_china_its_provisio... | the treaty with china its provisions explained | ^New York Tribune | twain | non-fiction | 1868 | 1860 | 1 | 7142 | twain 33077: the treaty with china its provisi... |
| 61522 | 4.577857 | 0.007410 | 0.011398 | Twain/61522-the_1000000_bank_note.txt | the 1000000 bank note | ^_THE £1,000,000 BANK-NOTE_$|^_METNAL TELEGRAP... | twain | stories | 1893 | 1890 | 6 | 65207 | twain 61522: the 1000000 bank note |
# merge PAIRS, LIB to add label col twice (for doc_a, doc_b) to include author, book_id, title
DISTS = pd.merge(PAIRS.reset_index(), LIB['label'], left_on = 'doc_a', right_on = 'book_id', how = 'left')
DISTS = pd.merge(DISTS, LIB['label'], left_on = 'doc_b', right_on = 'book_id', how = 'right')
DISTS = DISTS.set_index(['doc_a', 'doc_b']).rename({'label_x': 'label_a', 'label_y': 'label_b'}, axis = 1)
# reorder df columns so that label_a and label_b first
DISTS.insert(loc = 0, column = 'label_a', value = DISTS.pop('label_a'))
DISTS.insert(loc = 1, column = 'label_b', value = DISTS.pop('label_b'))
DISTS.head(20).style.background_gradient(cmap='YlGnBu', high=.5, axis=0)
| label_a | label_b | corr_raw | cityblock–raw | euclidean–raw | euclidean–l2 | cosine–raw | cityblock–l1 | jaccard–l0 | js–l0 | js–l1 | js–l2 | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| doc_a | doc_b | ||||||||||||
| 74.0 | 70.0 | twain 74: the adventures of tom sawyer | twain 70: what is man | 0.934347 | 4.546088 | 0.234767 | 0.939812 | 0.441623 | 0.810756 | 0.037000 | 0.114145 | 0.368884 | 0.368884 |
| 76.0 | 70.0 | twain 76: the adventures of huckleberry finn | twain 70: what is man | 1.006768 | 4.975003 | 0.297118 | 1.093061 | 0.597391 | 0.996911 | 0.128000 | 0.216051 | 0.448822 | 0.448822 |
| 86.0 | 70.0 | twain 86: a connecticut yankee in king arthurs court | twain 70: what is man | 0.920990 | 4.889381 | 0.292809 | 1.114627 | 0.621196 | 1.037877 | 0.233233 | 0.297915 | 0.466381 | 0.466381 |
| 91.0 | 70.0 | twain 91: tom sawyer abroad | twain 70: what is man | 1.002018 | 3.006633 | 0.140112 | 0.758177 | 0.287416 | 0.623418 | 0.006000 | 0.045661 | 0.283683 | 0.283683 |
| 93.0 | 70.0 | twain 93: tom sawyer detective | twain 70: what is man | 1.028131 | 4.887719 | 0.249661 | 1.002306 | 0.502308 | 0.993955 | 0.259519 | 0.315950 | 0.447095 | 0.447095 |
| 102.0 | 70.0 | twain 102: the tragedy of puddnhead wilson | twain 70: what is man | 0.958330 | 5.277857 | 0.308049 | 0.978975 | 0.479196 | 0.912322 | 0.310241 | 0.349162 | 0.430125 | 0.430125 |
| 119.0 | 70.0 | twain 119: a tramp abroad | twain 70: what is man | 0.889692 | 4.536682 | 0.225397 | 0.870263 | 0.378679 | 0.717113 | 0.031000 | 0.104361 | 0.328523 | 0.328523 |
| 142.0 | 70.0 | twain 142: the 30000 bequest and other stories | twain 70: what is man | 0.749916 | 3.710108 | 0.191821 | 0.896461 | 0.401821 | 0.754953 | 0.058058 | 0.143522 | 0.350748 | 0.350748 |
| 245.0 | 70.0 | twain 245: life on the mississippi | twain 70: what is man | 0.914644 | 3.392531 | 0.163527 | 0.783583 | 0.307001 | 0.645965 | 0.009000 | 0.055975 | 0.294312 | 0.294312 |
| 1044.0 | 70.0 | twain 1044: extract from captain stormfields visit to Heaven | twain 70: what is man | 0.987545 | 3.968964 | 0.228053 | 0.881571 | 0.388584 | 0.670278 | 0.035000 | 0.110927 | 0.313311 | 0.313311 |
| 1086.0 | 70.0 | twain 1086: a horses tale | twain 70: what is man | 0.891234 | 4.154492 | 0.229774 | 0.903064 | 0.407763 | 0.718509 | 0.051205 | 0.134949 | 0.344474 | 0.344474 |
| 1837.0 | 70.0 | twain 1837: the prince and the pauper | twain 70: what is man | 1.016241 | 6.885359 | 0.392070 | 1.178571 | 0.694515 | 1.190786 | 0.484232 | 0.457185 | 0.547425 | 0.547425 |
| 2874.0 | 70.0 | twain 2874: personal recollections of joan of arc vol 1 | twain 70: what is man | 0.921028 | 5.983874 | 0.382246 | 1.125937 | 0.633867 | 1.067707 | 0.429154 | 0.424008 | 0.498159 | 0.498159 |
| 2875.0 | 70.0 | twain 2875: personal recollections of joan of arc vol 2 | twain 70: what is man | 0.874359 | 4.595717 | 0.274207 | 0.993646 | 0.493666 | 0.887029 | 0.124000 | 0.212252 | 0.407892 | 0.407892 |
| 2895.0 | 70.0 | twain 2895: following the equator | twain 70: what is man | 0.746805 | 4.854033 | 0.274275 | 0.973064 | 0.473426 | 0.888924 | 0.124000 | 0.212252 | 0.404087 | 0.404087 |
| 3171.0 | 70.0 | twain 3171: in defense of harriet shelley | twain 70: what is man | 0.810398 | 6.024164 | 0.419678 | 1.190627 | 0.708796 | 1.151072 | 0.447537 | 0.440022 | 0.531670 | 0.531670 |
| 3172.0 | 70.0 | twain 3172: fenimore coopers literary offences | twain 70: what is man | 0.944777 | 5.018517 | 0.245490 | 0.947697 | 0.449065 | 0.933607 | 0.259259 | 0.315684 | 0.427653 | 0.427653 |
| 3173.0 | 70.0 | twain 3173: essays on paul bourget | twain 70: what is man | 0.785908 | 4.344996 | 0.232379 | 0.910531 | 0.414533 | 0.776906 | 0.201000 | 0.274618 | 0.376146 | 0.376146 |
| 3176.0 | 70.0 | twain 3176: the innocents abroad | twain 70: what is man | 0.936830 | 5.354566 | 0.267776 | 0.888264 | 0.394507 | 0.933060 | 0.245187 | 0.307883 | 0.427854 | 0.427854 |
| 3177.0 | 70.0 | twain 3177: roughing it | twain 70: what is man | 0.981081 | 5.007724 | 0.273885 | 0.941828 | 0.443520 | 0.950650 | 0.311966 | 0.357770 | 0.442515 | 0.442515 |
ZPAIRS = (PAIRS - PAIRS.mean()) / PAIRS.std()
ZPAIRS
| corr_raw | cityblock–raw | euclidean–raw | euclidean–l2 | cosine–raw | cityblock–l1 | jaccard–l0 | js–l0 | js–l1 | js–l2 | ||
|---|---|---|---|---|---|---|---|---|---|---|---|
| doc_a | doc_b | ||||||||||
| 74 | 70 | 0.201052 | -0.616593 | -0.623747 | -0.440606 | -0.516566 | -0.710584 | -1.275181 | -1.294188 | -0.784200 | -0.784200 |
| 76 | 70 | 0.831302 | -0.291196 | 0.087454 | 0.572656 | 0.538375 | 0.064428 | -0.831873 | -0.657971 | -0.032502 | -0.032502 |
| 74 | -3.574216 | -1.427193 | -1.268485 | -1.069729 | -1.091531 | -1.386310 | -1.416455 | -1.677430 | -1.433436 | -1.433436 | |
| 86 | 70 | 0.084815 | -0.356153 | 0.038305 | 0.715245 | 0.699595 | 0.234977 | -0.319228 | -0.146881 | 0.132609 | 0.132609 |
| 74 | -0.469586 | 0.115506 | 0.560733 | 0.961805 | 0.985803 | 0.606981 | 0.059614 | 0.178596 | 0.538261 | 0.538261 | |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 62739 | 19987 | 0.037397 | 1.906864 | 1.518612 | 1.177751 | 1.244211 | 1.500613 | 1.585456 | 1.369783 | 1.383700 | 1.383700 |
| 33077 | -0.602780 | 0.723475 | 0.241571 | 0.926680 | 0.944455 | 0.780878 | 0.284041 | 0.373695 | 0.716812 | 0.716812 | |
| 60900 | 1.019339 | 1.072599 | 1.019166 | 0.915623 | 0.931478 | 1.284908 | 1.583729 | 1.355870 | 1.221409 | 1.221409 | |
| 61522 | 0.759148 | -0.323417 | -0.658215 | 0.479398 | 0.434635 | 0.528006 | 0.242222 | 0.326035 | 0.443758 | 0.443758 | |
| 62636 | -0.005703 | 0.528097 | 1.206798 | 1.262290 | 1.347341 | 1.425303 | 1.582722 | 1.447839 | 1.470754 | 1.470754 |
990 rows × 10 columns
ZPAIRS.T.sort_index().T.plot.box(rot = 45, figsize = (15,7));
sns.pairplot(ZPAIRS);
n_clusters = 4
# instantiate KMeans model
km = KMeans(n_clusters, random_state = 314)
# compute cluster centers and predict cluster index for each sample using raw and normalized feature vectors
book_DOC['y_raw'] = km.fit_predict(mean_TFIDF_sigs)
book_DOC['y_L0'] = km.fit_predict(L0)
book_DOC['y_L1'] = km.fit_predict(L1)
book_DOC['y_L2'] = km.fit_predict(L2)
book_DOC.iloc[:,1:].sort_values('label').style.background_gradient(cmap = 'RdBu')
| title | label | y_raw | y_L0 | y_L1 | y_L2 | |
|---|---|---|---|---|---|---|
| book_id | ||||||
| 102 | the tragedy of puddnhead wilson | twain 102: the tragedy of puddnhead wilson | 0 | 0 | 0 | 1 |
| 1044 | extract from captain stormfields visit to Heaven | twain 1044: extract from captain stormfields visit to Heaven | 2 | 3 | 3 | 2 |
| 1086 | a horses tale | twain 1086: a horses tale | 0 | 0 | 0 | 1 |
| 119 | a tramp abroad | twain 119: a tramp abroad | 0 | 0 | 0 | 0 |
| 142 | the 30000 bequest and other stories | twain 142: the 30000 bequest and other stories | 0 | 0 | 0 | 0 |
| 1837 | the prince and the pauper | twain 1837: the prince and the pauper | 0 | 0 | 0 | 1 |
| 19484 | editorial wild oats | twain 19484: editorial wild oats | 0 | 1 | 0 | 0 |
| 19987 | chapters from my autobiography | twain 19987: chapters from my autobiography | 0 | 0 | 0 | 0 |
| 245 | life on the mississippi | twain 245: life on the mississippi | 0 | 0 | 0 | 0 |
| 2874 | personal recollections of joan of arc vol 1 | twain 2874: personal recollections of joan of arc vol 1 | 0 | 0 | 0 | 1 |
| 2875 | personal recollections of joan of arc vol 2 | twain 2875: personal recollections of joan of arc vol 2 | 0 | 0 | 0 | 1 |
| 2895 | following the equator | twain 2895: following the equator | 0 | 0 | 0 | 0 |
| 3171 | in defense of harriet shelley | twain 3171: in defense of harriet shelley | 0 | 1 | 0 | 0 |
| 3172 | fenimore coopers literary offences | twain 3172: fenimore coopers literary offences | 0 | 2 | 0 | 0 |
| 3173 | essays on paul bourget | twain 3173: essays on paul bourget | 0 | 1 | 0 | 0 |
| 3176 | the innocents abroad | twain 3176: the innocents abroad | 0 | 0 | 0 | 0 |
| 3177 | roughing it | twain 3177: roughing it | 0 | 0 | 0 | 0 |
| 3178 | the gilded age | twain 3178: the gilded age | 0 | 0 | 0 | 0 |
| 3179 | the american claimant | twain 3179: the american claimant | 0 | 0 | 0 | 1 |
| 3180 | a double barrelled detective story | twain 3180: a double barrelled detective story | 0 | 0 | 0 | 1 |
| 3181 | the stolen white elephant | twain 3181: the stolen white elephant | 0 | 1 | 0 | 0 |
| 3182 | some rambling notes of an idle excursion | twain 3182: some rambling notes of an idle excursion | 0 | 0 | 0 | 0 |
| 3183 | the facts concerning the recent carnival of crime in connecticut | twain 3183: the facts concerning the recent carnival of crime in connecticut | 3 | 1 | 0 | 1 |
| 3184 | alonzo fitz and other stories | twain 3184: alonzo fitz and other stories | 0 | 0 | 0 | 0 |
| 3185 | those extraordinary twins | twain 3185: those extraordinary twins | 0 | 0 | 0 | 1 |
| 3186 | the mysterious stranger and other stories | twain 3186: the mysterious stranger and other stories | 0 | 0 | 0 | 1 |
| 3188 | mark twain speeches | twain 3188: mark twain speeches | 0 | 0 | 0 | 0 |
| 3189 | sketches new and old | twain 3189: sketches new and old | 0 | 0 | 0 | 0 |
| 3190 | 1601 conversation as it was by the social fireside in the time of the tudors | twain 3190: 1601 conversation as it was by the social fireside in the time of the tudors | 0 | 1 | 0 | 0 |
| 3191 | goldsmiths friend abroad again | twain 3191: goldsmiths friend abroad again | 0 | 1 | 0 | 0 |
| 3192 | the curious republic of gondour and other whimsical sketches | twain 3192: the curious republic of gondour and other whimsical sketches | 0 | 0 | 0 | 0 |
| 3199 | the letters of mark twain | twain 3199: the letters of mark twain | 0 | 0 | 0 | 0 |
| 3250 | how to tell a story and other essays | twain 3250: how to tell a story and other essays | 0 | 3 | 0 | 0 |
| 3251 | the man that corrupted hadleyburg and other stories | twain 3251: the man that corrupted hadleyburg and other stories | 0 | 0 | 0 | 0 |
| 33077 | the treaty with china its provisions explained | twain 33077: the treaty with china its provisions explained | 0 | 2 | 1 | 3 |
| 60900 | merry tales | twain 60900: merry tales | 0 | 0 | 0 | 1 |
| 61522 | the 1000000 bank note | twain 61522: the 1000000 bank note | 0 | 0 | 0 | 0 |
| 62636 | to the person sitting in darkness | twain 62636: to the person sitting in darkness | 1 | 2 | 2 | 0 |
| 62739 | king leopolds soliloquy | twain 62739: king leopolds soliloquy | 0 | 1 | 0 | 0 |
| 70 | what is man | twain 70: what is man | 0 | 0 | 0 | 0 |
| 74 | the adventures of tom sawyer | twain 74: the adventures of tom sawyer | 0 | 0 | 0 | 1 |
| 76 | the adventures of huckleberry finn | twain 76: the adventures of huckleberry finn | 2 | 0 | 3 | 2 |
| 86 | a connecticut yankee in king arthurs court | twain 86: a connecticut yankee in king arthurs court | 0 | 0 | 0 | 1 |
| 91 | tom sawyer abroad | twain 91: tom sawyer abroad | 2 | 3 | 3 | 2 |
| 93 | tom sawyer detective | twain 93: tom sawyer detective | 2 | 3 | 3 | 2 |
# k values to test
k_vals = list(range(2, 11))
# different feature vectors to use
feature_vectors = {'raw': mean_TFIDF_sigs,
'L0': L0,
'L1': L1,
'L2': L2}
# empty dataframe
km_results = pd.DataFrame(columns = ['k', 'raw_silhouette_score', 'L0_silhouette_score', 'L1_silhouette_score', 'L2_silhouette_score'])
# loop through k values (num of clusters) and compute silhouette score to find best of combo of k, feature vector
for k in k_vals:
km = KMeans(k, random_state = 314)
results = [k]
for vec in feature_vectors.values():
labels = km.fit_predict(vec)
results.append(silhouette_score(vec, labels))
km_results.loc[len(km_results)] = results
km_results.style.background_gradient(cmap = 'RdBu', axis = None, subset = km_results.columns[1:])
| k | raw_silhouette_score | L0_silhouette_score | L1_silhouette_score | L2_silhouette_score | |
|---|---|---|---|---|---|
| 0 | 2.000000 | 0.255373 | 0.377931 | 0.236509 | 0.091829 |
| 1 | 3.000000 | 0.259579 | 0.342163 | 0.242996 | 0.064479 |
| 2 | 4.000000 | 0.248763 | 0.337037 | 0.240644 | 0.054186 |
| 3 | 5.000000 | 0.138738 | 0.336095 | 0.247383 | 0.054036 |
| 4 | 6.000000 | 0.245425 | 0.336220 | 0.057003 | 0.042060 |
| 5 | 7.000000 | 0.142032 | 0.327749 | 0.219367 | 0.047595 |
| 6 | 8.000000 | 0.136787 | 0.335951 | 0.047044 | 0.057804 |
| 7 | 9.000000 | 0.072876 | 0.330097 | 0.045014 | 0.042156 |
| 8 | 10.000000 | 0.049228 | 0.311804 | 0.087053 | 0.044631 |
km_results.iloc[:, 1:].idxmax()
raw_silhouette_score 1 L0_silhouette_score 0 L1_silhouette_score 3 L2_silhouette_score 0 dtype: int64
# overall highest silhouette score
max_silhouette_score = km_results.iloc[:,1:].max().max()
# k value (num of clusters) corresponding to the highest silhouette score
max_score_cluster = km_results.loc[km_results[km_results == max_silhouette_score].any(axis = 1)]['k'].iloc[0]
# feature vector corresponding to the highest silhouette score
max_score_vec = km_results.loc[km_results[km_results == max_silhouette_score].any(axis = 1)].iloc[:, 1:].idxmax(axis = 1).iloc[0]
# create a col with labels corresponding to k value, feature vector that yield highest silhouette score
km = KMeans(int(max_score_cluster), random_state = 314)
max_col_name = 'max_y_{}'.format(max_score_vec.split('_')[0])
book_DOC[max_col_name] = km.fit_predict(feature_vectors[max_score_vec.split('_')[0]])
# add to see cluster breakdown by type
book_DOC = book_DOC.join(LIB['type'])
book_DOC[['label', 'type', max_col_name]].sort_values(max_col_name).style.background_gradient(cmap = 'RdBu')
| label | type | max_y_L0 | |
|---|---|---|---|
| book_id | |||
| 62739 | twain 62739: king leopolds soliloquy | stories | 0 |
| 62636 | twain 62636: to the person sitting in darkness | non-fiction | 0 |
| 3173 | twain 3173: essays on paul bourget | non-fiction | 0 |
| 3172 | twain 3172: fenimore coopers literary offences | non-fiction | 0 |
| 3171 | twain 3171: in defense of harriet shelley | non-fiction | 0 |
| 3190 | twain 3190: 1601 conversation as it was by the social fireside in the time of the tudors | stories | 0 |
| 3183 | twain 3183: the facts concerning the recent carnival of crime in connecticut | stories | 0 |
| 1044 | twain 1044: extract from captain stormfields visit to Heaven | stories | 0 |
| 3191 | twain 3191: goldsmiths friend abroad again | stories | 0 |
| 19484 | twain 19484: editorial wild oats | stories | 0 |
| 93 | twain 93: tom sawyer detective | novel | 0 |
| 33077 | twain 33077: the treaty with china its provisions explained | non-fiction | 0 |
| 3250 | twain 3250: how to tell a story and other essays | non-fiction | 0 |
| 3181 | twain 3181: the stolen white elephant | stories | 0 |
| 3199 | twain 3199: the letters of mark twain | non-fiction | 1 |
| 3251 | twain 3251: the man that corrupted hadleyburg and other stories | stories | 1 |
| 3192 | twain 3192: the curious republic of gondour and other whimsical sketches | stories | 1 |
| 3189 | twain 3189: sketches new and old | stories | 1 |
| 3188 | twain 3188: mark twain speeches | non-fiction | 1 |
| 19987 | twain 19987: chapters from my autobiography | non-fiction | 1 |
| 60900 | twain 60900: merry tales | stories | 1 |
| 3186 | twain 3186: the mysterious stranger and other stories | stories | 1 |
| 3185 | twain 3185: those extraordinary twins | stories | 1 |
| 3184 | twain 3184: alonzo fitz and other stories | stories | 1 |
| 61522 | twain 61522: the 1000000 bank note | stories | 1 |
| 70 | twain 70: what is man | non-fiction | 1 |
| 3180 | twain 3180: a double barrelled detective story | stories | 1 |
| 74 | twain 74: the adventures of tom sawyer | novel | 1 |
| 76 | twain 76: the adventures of huckleberry finn | novel | 1 |
| 86 | twain 86: a connecticut yankee in king arthurs court | novel | 1 |
| 91 | twain 91: tom sawyer abroad | novel | 1 |
| 102 | twain 102: the tragedy of puddnhead wilson | novel | 1 |
| 119 | twain 119: a tramp abroad | non-fiction | 1 |
| 142 | twain 142: the 30000 bequest and other stories | stories | 1 |
| 3182 | twain 3182: some rambling notes of an idle excursion | non-fiction | 1 |
| 245 | twain 245: life on the mississippi | non-fiction | 1 |
| 1837 | twain 1837: the prince and the pauper | novel | 1 |
| 2874 | twain 2874: personal recollections of joan of arc vol 1 | non-fiction | 1 |
| 2875 | twain 2875: personal recollections of joan of arc vol 2 | non-fiction | 1 |
| 2895 | twain 2895: following the equator | non-fiction | 1 |
| 3176 | twain 3176: the innocents abroad | non-fiction | 1 |
| 3177 | twain 3177: roughing it | novel | 1 |
| 3178 | twain 3178: the gilded age | novel | 1 |
| 1086 | twain 1086: a horses tale | novel | 1 |
| 3179 | twain 3179: the american claimant | novel | 1 |
book_DOC.groupby(max_col_name).size()
max_y_L0 0 14 1 31 dtype: int64
# cluster breakdown by type
book_DOC.groupby(['type', max_col_name]).size()
type max_y_L0
non-fiction 0 6
1 11
novel 0 1
1 10
stories 0 7
1 10
dtype: int64
TFIDF_sigs
| term_str | saying | seem | indeed | couldnt | door | taken | deal | fifty | getting | perhaps | ... | shame | dawn | privilege | loved | busy | record | watched | laws | information | questions | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| book_id | chap_id | |||||||||||||||||||||
| 70 | 1 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 |
| 2 | 0.005272 | 0.005285 | 0.005246 | 0.010622 | 0.001062 | 0.004187 | 0.002093 | 0.004187 | 0.002130 | 0.014907 | ... | 0.013009 | 0.000000 | 0.002168 | 0.002168 | 0.002168 | 0.006505 | 0.006505 | 0.006505 | 0.0 | 0.002168 | |
| 3 | 0.006831 | 0.000000 | 0.000000 | 0.006881 | 0.055051 | 0.000000 | 0.000000 | 0.006781 | 0.006898 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.042142 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 | |
| 4 | 0.005859 | 0.000000 | 0.005830 | 0.005902 | 0.000000 | 0.000000 | 0.011632 | 0.017448 | 0.000000 | 0.011834 | ... | 0.000000 | 0.012049 | 0.024098 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.012049 | |
| 5 | 0.008429 | 0.012674 | 0.004194 | 0.012737 | 0.004246 | 0.004183 | 0.004183 | 0.012550 | 0.008512 | 0.008512 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 | |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 62739 | 2 | 0.016015 | 0.008027 | 0.003984 | 0.004033 | 0.000000 | 0.003974 | 0.003974 | 0.000000 | 0.000000 | 0.000000 | ... | 0.024700 | 0.000000 | 0.000000 | 0.000000 | 0.008233 | 0.024700 | 0.000000 | 0.000000 | 0.0 | 0.000000 |
| 3 | 0.045041 | 0.045152 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.044711 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 | |
| 4 | 0.005093 | 0.005106 | 0.005068 | 0.010261 | 0.005131 | 0.000000 | 0.000000 | 0.005056 | 0.005143 | 0.000000 | ... | 0.010473 | 0.000000 | 0.000000 | 0.000000 | 0.010473 | 0.010473 | 0.000000 | 0.000000 | 0.0 | 0.000000 | |
| 5 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.013626 | 0.013626 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 | |
| 6 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.043356 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 |
1108 rows × 1000 columns
chap_DOC = pd.DataFrame(index = TFIDF.index)
chap_DOC = chap_DOC.join(LIB[['author', 'title', 'type', 'decade']], on = 'book_id')
chap_DOC['label'] = chap_DOC.apply(lambda x: "{}-{}-{}".format(x.name[0], x.author, x.name[1]), 1)
chap_DOC['mean_tfidf'] = TFIDF.mean(1)
chap_DOC['n_tokens'] = BOW.groupby(OHCO[:2]).n.sum()
chap_DOC
| author | title | type | decade | label | mean_tfidf | n_tokens | ||
|---|---|---|---|---|---|---|---|---|
| book_id | chap_id | |||||||
| 70 | 1 | twain | what is man | non-fiction | 1900 | 70-twain-1 | 0.000506 | 71 |
| 2 | twain | what is man | non-fiction | 1900 | 70-twain-2 | 0.000605 | 26698 | |
| 3 | twain | what is man | non-fiction | 1900 | 70-twain-3 | 0.000507 | 4367 | |
| 4 | twain | what is man | non-fiction | 1900 | 70-twain-4 | 0.000380 | 3669 | |
| 5 | twain | what is man | non-fiction | 1900 | 70-twain-5 | 0.000419 | 5465 | |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 62739 | 2 | twain | king leopolds soliloquy | stories | 1900 | 62739-twain-2 | 0.000536 | 6390 |
| 3 | twain | king leopolds soliloquy | stories | 1900 | 62739-twain-3 | 0.000782 | 686 | |
| 4 | twain | king leopolds soliloquy | stories | 1900 | 62739-twain-4 | 0.000420 | 3033 | |
| 5 | twain | king leopolds soliloquy | stories | 1900 | 62739-twain-5 | 0.000354 | 1129 | |
| 6 | twain | king leopolds soliloquy | stories | 1900 | 62739-twain-6 | 0.000428 | 356 |
1108 rows × 7 columns
TFIDF_sigs
| term_str | saying | seem | indeed | couldnt | door | taken | deal | fifty | getting | perhaps | ... | shame | dawn | privilege | loved | busy | record | watched | laws | information | questions | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| book_id | chap_id | |||||||||||||||||||||
| 70 | 1 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 |
| 2 | 0.005272 | 0.005285 | 0.005246 | 0.010622 | 0.001062 | 0.004187 | 0.002093 | 0.004187 | 0.002130 | 0.014907 | ... | 0.013009 | 0.000000 | 0.002168 | 0.002168 | 0.002168 | 0.006505 | 0.006505 | 0.006505 | 0.0 | 0.002168 | |
| 3 | 0.006831 | 0.000000 | 0.000000 | 0.006881 | 0.055051 | 0.000000 | 0.000000 | 0.006781 | 0.006898 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.042142 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 | |
| 4 | 0.005859 | 0.000000 | 0.005830 | 0.005902 | 0.000000 | 0.000000 | 0.011632 | 0.017448 | 0.000000 | 0.011834 | ... | 0.000000 | 0.012049 | 0.024098 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.012049 | |
| 5 | 0.008429 | 0.012674 | 0.004194 | 0.012737 | 0.004246 | 0.004183 | 0.004183 | 0.012550 | 0.008512 | 0.008512 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 | |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 62739 | 2 | 0.016015 | 0.008027 | 0.003984 | 0.004033 | 0.000000 | 0.003974 | 0.003974 | 0.000000 | 0.000000 | 0.000000 | ... | 0.024700 | 0.000000 | 0.000000 | 0.000000 | 0.008233 | 0.024700 | 0.000000 | 0.000000 | 0.0 | 0.000000 |
| 3 | 0.045041 | 0.045152 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.044711 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 | |
| 4 | 0.005093 | 0.005106 | 0.005068 | 0.010261 | 0.005131 | 0.000000 | 0.000000 | 0.005056 | 0.005143 | 0.000000 | ... | 0.010473 | 0.000000 | 0.000000 | 0.000000 | 0.010473 | 0.010473 | 0.000000 | 0.000000 | 0.0 | 0.000000 | |
| 5 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.013626 | 0.013626 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 | |
| 6 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.043356 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 |
1108 rows × 1000 columns
LOADINGS, DCM, COMPINF = get_pca(TFIDF_sigs, norm_docs = True, center_by_mean = False, center_by_variance = False)
px.scatter(DCM, 0, 1, color=chap_DOC.type,
size=np.abs(chap_DOC.mean_tfidf), hover_name=chap_DOC.label,
marginal_x='box', marginal_y='box', height=1000)
px.scatter(DCM, 2, 3, color=chap_DOC.type,
size=np.abs(chap_DOC.mean_tfidf), hover_name=chap_DOC.label,
marginal_x='box', marginal_y='box', height=1000)
X = LOADINGS.join(SIGS, how='inner').reset_index()
px.scatter(X, 0, 1, size=X.n, color=X.dfidf,
hover_name='term_str', hover_data=['max_pos'],
marginal_x='box', marginal_y='box',
height=1000, width=1000)
COMPINF
| pos | neg | eig_val | exp_var | |
|---|---|---|---|---|
| pc_id | ||||
| 0 | aint says dont reckon thats | city feet government french war | 0.025783 | 0.224980 |
| 1 | river boat water feet miles | letter letters wrote book am | 0.016925 | 0.147683 |
| 2 | sir court face voice father | letter letters book river wrote | 0.013273 | 0.115819 |
| 3 | bill american sir honor government | letter feet oh mother boys | 0.012755 | 0.111299 |
| 4 | says city aint church sea | river boat sir letter shore | 0.009164 | 0.079961 |
| 5 | sir government dollars bill money | letter letters book boys mother | 0.008131 | 0.070952 |
| 6 | sir letter ship sea letters | river french government boat war | 0.007849 | 0.068485 |
| 7 | dollars money boys silver boy | says sir french ship sea | 0.007382 | 0.064418 |
| 8 | ship boat sea dont oh | says dollars letter silver gold | 0.006870 | 0.059950 |
| 9 | ship horse french yes army | says father sir woman river | 0.006470 | 0.056452 |
pca = PCA(
n_components=6,
n_iter=3,
rescale_with_mean=False, # Already set and applied to TFIDF
rescale_with_std=False, # Already set and applied to TFIDF
copy=True,
check_input=True,
engine='auto',
random_state=42
)
pca = pca.fit(TFIDF)
dcm = pca.transform(TFIDF)
dcm
| 0 | 1 | 2 | 3 | 4 | 5 | ||
|---|---|---|---|---|---|---|---|
| book_id | chap_id | ||||||
| 70 | 1 | 0.113931 | 0.302671 | -0.170870 | 0.040518 | -0.037079 | -0.075492 |
| 2 | 0.120829 | 0.241899 | -0.146759 | -0.027912 | -0.016165 | 0.163990 | |
| 3 | 0.162728 | 0.294099 | -0.166797 | -0.021350 | -0.015659 | -0.026615 | |
| 4 | 0.075285 | 0.150127 | -0.085885 | -0.019615 | -0.021718 | 0.001746 | |
| 5 | 0.091257 | 0.178199 | -0.095310 | -0.023824 | -0.011637 | -0.013031 | |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 62739 | 2 | 0.102761 | 0.195586 | -0.099366 | -0.025425 | -0.013681 | -0.010889 |
| 3 | 0.138763 | 0.272401 | -0.149855 | -0.030387 | -0.053265 | -0.013249 | |
| 4 | 0.055947 | 0.124940 | -0.060920 | -0.019205 | -0.005833 | -0.005138 | |
| 5 | 0.061812 | 0.135055 | -0.077031 | -0.020244 | -0.007853 | -0.007598 | |
| 6 | 0.065727 | 0.158000 | -0.077146 | -0.014555 | -0.012263 | 0.021367 |
1108 rows × 6 columns
px.scatter(dcm, 0, 1,
color=chap_DOC.type,
size=chap_DOC.n_tokens, hover_name=chap_DOC.label,
height=1000, width=1200,
marginal_x='box', marginal_y='box')
Note that the chapter numbers listed below are one chapter greater than those in the book (e.g., Chapter 4 below is actually Chapter 14) but there is an intro
# function to calculate the upper fence / bound in the box plots above for the different PCs
def upper_fence(df, books, pc):
pc_IQR = df.loc[books, pc].quantile(0.75) - df.loc[books, pc].quantile(0.25)
return 1.5 * pc_IQR + df.loc[books, pc].quantile(0.75)
# upper fences for pc 0
twain_0_upper_fence = upper_fence(dcm, LIB.index.values, 0)
# upper fences for pc 1
twain_1_upper_fence = upper_fence(dcm, LIB.index.values, 1)
# outliers the chapters in books with PC 0 or PC 1 greater than the max of the upper fence for dickens and twain for each PC
outliers = dcm.loc[(dcm[0] > twain_0_upper_fence) | (dcm[1] > twain_1_upper_fence)].index.values
# known Twain outliers from experimentation (see twain_analysis_M7.ipynb)
twain_outliers = [(70, 2), (76, 4), (76, 8), (76, 15), (76, 23), (76, 38), (76, 43), (3188, 10), (102, 4), (102, 9), (102, 10),
(102, 15), (102, 19), (142, 16), (3180, 10), (3250, 3), (60900, 6), (3188, 2), (3188, 69), (3189, 3), (3189, 7), (3189, 31)]
# remove outliers from corpus
small_CORPUS = CORPUS.loc[~CORPUS.index.droplevel(['para_num', 'sent_num', 'token_num']) \
.isin(outliers)]
# remove known Twain outliers from corpus
small_CORPUS = small_CORPUS.loc[~small_CORPUS.index.droplevel(['para_num', 'sent_num', 'token_num']) \
.isin(twain_outliers)]
# remove outliers from vocab
small_VOCAB = VOCAB.loc[VOCAB.index.isin(small_CORPUS.term_str)]
# remove proper nounrs
proper_nouns = ['NNP', 'NNPS']
small_VOCAB = VOCAB.loc[~VOCAB.max_pos.isin(proper_nouns)]
# remove ~18% of VOCAB data
(VOCAB.shape[0] - small_VOCAB.shape[0]) / VOCAB.shape[0]
0.17647714190217997
# remove proper nouns from corpus
small_CORPUS = small_CORPUS.loc[small_CORPUS.term_str.isin(small_VOCAB.index.values)]
# remove ~11% of data
(CORPUS.shape[0] - small_CORPUS.shape[0]) / CORPUS.shape[0]
0.10910914381979803
small_BOW = create_bow(small_CORPUS, CHAPS)
# suppress chained assignment warning
pd.options.mode.chained_assignment = None
small_DTCM, small_TFIDF, small_BOW, small_DFIDF, small_VOCAB = get_tfidf(small_BOW, small_VOCAB, tf_method = 'max', idf_method = 'standard')
small_chap_DOC = pd.DataFrame(index = small_TFIDF.index)
small_chap_DOC = small_chap_DOC.join(LIB[['author', 'title', 'type', 'decade']], on = 'book_id')
small_chap_DOC['label'] = small_chap_DOC.apply(lambda x: "{}-{}-{}".format(x.name[0], x.author, x.name[1]), 1)
small_chap_DOC['mean_tfidf'] = TFIDF.mean(1)
small_chap_DOC['n_tokens'] = small_BOW.groupby(OHCO[:2]).n.sum()
small_pca = pca.fit(small_TFIDF)
small_dcm = pca.transform(small_TFIDF)
px.scatter(small_dcm, 0, 1,
color=small_chap_DOC.type,
size=small_chap_DOC.n_tokens, hover_name=small_chap_DOC.label,
height=1000, width=1200,
marginal_x='box', marginal_y='box')
TFIDF_sigs)¶pca_sigs = pca.fit(TFIDF_sigs)
dcm_sigs = pca_sigs.transform(TFIDF_sigs)
dcm_sigs
| 0 | 1 | 2 | 3 | 4 | 5 | ||
|---|---|---|---|---|---|---|---|
| book_id | chap_id | ||||||
| 70 | 1 | 0.056153 | -0.023210 | -0.013009 | 0.004759 | -0.005821 | 0.004595 |
| 2 | 0.180922 | -0.038227 | 0.022268 | -0.039044 | -0.009613 | 0.004492 | |
| 3 | 0.220261 | -0.079028 | 0.039250 | 0.012342 | 0.013575 | -0.047303 | |
| 4 | 0.133059 | -0.036709 | -0.007816 | 0.012793 | 0.007986 | 0.016466 | |
| 5 | 0.154748 | -0.040718 | -0.017498 | 0.031480 | 0.008514 | -0.012437 | |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 62739 | 2 | 0.168050 | -0.050184 | -0.021159 | 0.007675 | -0.006063 | 0.012953 |
| 3 | 0.159232 | -0.048936 | 0.003923 | -0.028582 | 0.020467 | 0.037295 | |
| 4 | 0.072618 | -0.030855 | 0.001835 | 0.009470 | 0.000830 | 0.018404 | |
| 5 | 0.089742 | -0.041856 | 0.026121 | -0.001859 | -0.024549 | 0.000448 | |
| 6 | 0.064286 | -0.040945 | 0.010348 | 0.004402 | -0.011561 | 0.000698 |
1108 rows × 6 columns
px.scatter(dcm_sigs, 0, 1,
color=chap_DOC.type,
size=chap_DOC.n_tokens, hover_name=chap_DOC.label,
height=1000, width=1200,
marginal_x='box', marginal_y='box')
# save BOW for topic modeling
BOW.to_csv(f'twain_BOW.csv')
pandas.DataFrame.droplevel to drop one or more levels of a MultiIndex: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.droplevel.htmlMultiIndex levels: https://stackoverflow.com/questions/52798386/pandas-dataframe-how-to-retrieve-specific-combinations-of-multiindex-levelsMultiIndex: https://towardsdatascience.com/accessing-data-in-a-multiindex-dataframe-in-pandas-569e8767201dsklearn.metrics.silhouette_score: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.htmlbackground_gradient: https://pandas.pydata.org/docs/reference/api/pandas.io.formats.style.Styler.background_gradient.html